Add a cleaner for subtitles. - annna - Annna the nice friendly bot.
 (HTM) git clone git://bitreich.org/annna/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/annna/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
       ---
 (DIR) commit 30eb06f31d2a42e280ea01521f6baa1bca2fec33
 (DIR) parent a622486a185d90ca0225311dbc9e88a9fbab5994
 (HTM) Author: Annna Robert-Houdin <annna@bitreich.org>
       Date:   Sun, 10 May 2020 18:57:47 +0200
       
       Add a cleaner for subtitles.
       
       Be careful, it is under Mafia Domain.
       
       Thanks leot!
       
       Diffstat:
         M subtitle-paste                      |       3 +++
         A ytautosubcleaner.awk                |      51 +++++++++++++++++++++++++++++++
       
       2 files changed, 54 insertions(+), 0 deletions(-)
       ---
 (DIR) diff --git a/subtitle-paste b/subtitle-paste
       @@ -22,6 +22,9 @@ if [ $(stat -c%s "${ofile}") -eq 0 ];
        then
                rm "${ofile}"
        else
       +        # Make it more human readable.
       +        awk -f /home/annna/bin/ytautosubcleaner.awk < "${ofile}" > "${ofile}.bak"
       +        mv "${ofile}.bak" "${ofile}"
                printf "gopher://bitreich.org/0/p/%s\n" "${ofile}"
        fi
        
 (DIR) diff --git a/ytautosubcleaner.awk b/ytautosubcleaner.awk
       @@ -0,0 +1,51 @@
       +#!/usr/bin/awk -f
       +
       +# This file is licensed under Mafia Domain. So be careful.
       +
       +# Make YouTube automatic subtitles more human readable: each caption is
       +# buffered in "caption", dropped if one of its lines contains "><c>",
       +# otherwise printed once the next timestamp line or end of input is seen.
       +
       +/^[0-9]+:[0-9]+:[0-9]+\.[0-9]+ -->/ {  # timestamp line: a new caption starts
       +        sub(/ align:start position:0%$/, "")  # strip the positioning suffix
       +        # flush the previous caption only if it holds more than two lines
       +        if (caption && split(caption, lines, "\n") > 2) {
       +                print caption
       +        }
       +        # stop ignoring; the new buffer begins with this timestamp line
       +        ignore = 0
       +        caption = $0
       +        next
       +}
       +
       +ignore {  # inside a dropped caption: discard lines until the next timestamp
       +        next
       +}
       +
       +# skip lines that consist only of spaces (truly empty lines do not match)
       +/^ +$/ {
       +        next
       +}
       +
       +# a line containing "><c>" marks the buffered caption as garbage; its text
       +# is expected to be repeated in the next caption, so drop the whole caption
       +caption && /><c>/ {
       +        ignore = 1
       +        caption = ""
       +        next
       +}
       +
       +caption {  # append this line to the buffered caption
       +        caption = caption "\n" $0
       +        next
       +}
       +
       +{  # nothing buffered and not ignoring: pass the line through unchanged
       +        print
       +}
       +
       +END {  # flush the last buffered caption, same rule as on a timestamp line
       +        if (caption && split(caption, lines, "\n") > 2) {
       +                print caption
       +        }
       +}