# Convert web page with JSON-LD Recipe data to Meal-Master format.
# Version 14
#
# Usage: awk -f jsontomm.awk infile.htm outbase
# Creates outbase.js, outbase.tsv and outbase.txt
#
# Hints:
#
# To fetch a recipe, use curl:
# curl --insecure -o web.htm URL
#
# On DOS, use GAWK.EXE.
#
# This script depends on json2tsv being in your path.
#
# Use MMBUSTER.EXE to convert outbase.txt to outbase.mmf for import into
# Meal-Master.
#
# JSON-LD spec:
# https://developers.google.com/search/docs/appearance/structured-data/recipe

# Return the number of elements stored in array a.
#
# Parameters:
#   a - array to count; it is not modified
# Returns: the element count (0 for an empty array).
# k and retval are locals; callers pass only a.
function array_size(a,     k, retval) {
    retval = 0
    # The loop variable is a declared local so that counting never
    # overwrites a global of the same name in the caller's scope.
    for (k in a) {
        retval++
    }
    return retval
}

# Upper-case the first letter of the ingredient name in an ingredient line.
#
# When the line starts with a quantity (digits, blanks, "/" and ".")
# optionally followed by unit abbreviations, the letter capitalized is the
# first character after that prefix; otherwise it is the first character of
# the line.
#
# Parameters:
#   line - one ingredient line
# Returns: the line with that one character upper-cased.
# after, before, c and retval are locals; callers pass only line.
function capitalize_ingredient(line,     after, before, c, retval) {
    # The pattern is anchored at the start, so RLENGTH is the prefix length.
    if (match(line, /^ *[[:digit:][:space:]\/.]+ +((bn|c|cl|ds|g|kg|l|lb|lg|md|ml|oz|pn|qt|sm|tb|ts) +)*/)) {
        # substr() positions start at 1; a start of 0 is not portable.
        before = substr(line, 1, RLENGTH)
        c = substr(line, RLENGTH + 1, 1)
        after = substr(line, RLENGTH + 2)
        retval = before toupper(c) after
    } else {
        c = substr(line, 1, 1)
        after = substr(line, 2)
        retval = toupper(c) after
    }
    return retval
}

# Return str without its last character.
#
# Parameters:
#   str - input string
# Returns: str minus its final character; "" when str has fewer than two
#          characters.
# retval is a local; callers pass only str.
function chop(str,     retval) {
    # substr() positions start at 1; a start of 0 is not portable.
    retval = substr(str, 1, length(str) - 1)
    return retval
}

# Escape a literal string so it can be embedded in a dynamic regular
# expression.
#
# A backslash is placed before each of: \ . [ ^ $ * + ? ( ) { |
# "]" and "}" are left alone because they are ordinary outside a bracket
# expression or interval, so a path such as "[0].name" still becomes
# "\[0]\.name".
#
# Parameters:
#   str - literal text
# Returns: the escaped text.
# ch, n, out and pos are locals; callers pass only str.
function escape_regex(str,     ch, n, out, pos) {
    out = ""
    n = length(str)
    for (pos = 1; pos <= n; pos++) {
        ch = substr(str, pos, 1)
        # index() does a plain substring search, so the metacharacter list
        # itself needs no regex escaping.
        if (index("\\.[^$*+?(){|", ch) > 0) {
            out = out "\\" ch
        } else {
            out = out ch
        }
    }
    return out
}

# Run the external json2tsv program to flatten a JSON file into TSV.
#
# Parameters:
#   infile  - JSON file fed to json2tsv on standard input
#   tsvfile - file that receives json2tsv's standard output
#
# Prints an error and exits with status 1 when the command returns a
# non-zero status.
# cmd and result are locals; callers pass only infile and tsvfile.
function export_tsv(infile, tsvfile,     cmd, result) {
    # NOTE(review): the file names are inserted into the shell command
    # unquoted, so names containing blanks or shell metacharacters will
    # break the command.  Quoting rules differ between the shells this
    # script targets, so this is flagged rather than changed here.
    cmd = "json2tsv -n < " infile " > " tsvfile
    result = system(cmd)
    if (result != 0) {
        printf "Error: json2tsv error code: %d\n", result
        exit 1
    }
    return
}

# Write one recipe in Meal-Master text form.
#
# Parameters:
#   infile  - TSV file of flattened JSON; each line is read as
#             path <TAB> type <TAB> value
#   outfile - text file to create (overwritten if it exists)
#
# Reads the global wraplen and the QUERY_* constants.  Prints an error and
# exits with status 1 when no Recipe object or no recipe title is found.
#
# The scalar names after the first two parameters are locals; callers pass
# only infile and outfile.  The work arrays (categories, cuisines,
# ingredients, instructions, section_*, sections, steps, steps_paths) stay
# global and are emptied with delete before each use.
function export_txt(infile, outfile,     i, j, key, name,
    author, author_type, authors_len, base, category, cook_time, cuisine,
    descr, len, prep_time, section_count, step, title, yield)
{
    # JSON-LD Recipe format can be a simple object, or an array of objects,
    # or a graph, or a combination of all of these.  Just traverse the
    # whole tree looking for an object where @type either is "Recipe" or
    # is an array containing "Recipe".

    base = select(infile, "", QUERY_RECIPE_BASE)
    if (length(base) == 0) {
        print "Error: No JSON-LD Recipe object found"
        exit 1
    }
    base = escape_regex(base)

    # if no title, fail early.  There might be no JSON-LD data.
    title = select(infile, "^" base "\\.name$", QUERY_STR)
    if (length(title) == 0) {
        print "Error: Could not find recipe title"
        exit 1
    }
    gsub(/^Recipe: /, "", title)
    title = format_title_case(title)

    # author can be a string, or an object, or an array of objects,
    # and an object may directly represent an author,
    # or an object may indirectly reference another document via URI/UUID
    author_type = select(infile, base, QUERY_AUTHOR_TYPE)
    author = ""
    if (author_type == "s") {
        author = select(infile, "^" base "\\.author$", QUERY_STR)
    } else if (author_type == "o") {
        author = get_author(infile, base "\\.author")
    } else if (author_type == "a") {
        author = ""
        authors_len = select(infile, base, QUERY_AUTHORS_LEN)
        for (i = 0; i < authors_len; i++) {
            key = base "\\.author\\[" i "]"
            name = get_author(infile, key)
            if (length(author) == 0) {
                author = name
            } else {
                author = author "," name
            }
        }
    }

    descr = select(infile, "^" base "\\.description", QUERY_STR)
    descr = format_units(descr, 0)

    cook_time = select(infile, "^" base "\\.cookTime", QUERY_STR)
    cook_time = format_iso8601_dur(cook_time)

    prep_time = select(infile, "^" base "\\.prepTime", QUERY_STR)
    if (prep_time == "PT0M") {
        prep_time = ""
    }
    prep_time = format_iso8601_dur(prep_time)

    delete categories
    get_arr(infile, "^" base "\\.recipeCategory", categories)
    category = format_list(categories)

    delete cuisines
    get_arr(infile, "^" base "\\.recipeCuisine", cuisines)
    cuisine = format_list(cuisines)

    # cuisines are appended to the category list
    if (length(category) == 0) {
        category = cuisine
    } else {
        if (length(cuisine) > 0) {
            category = category "," cuisine
        }
    }

    category = format_units(category, 0)

    # ingredients is an array of strings
    # unlike Meal-Master, ingredients cannot have sections
    delete ingredients
    get_arr(infile, "^" base "\\.recipeIngredient", ingredients)

    # instructions is an array of strings, HowToSections, and/or HowToSteps

    # get instructions strings
    # (the "." before recipeInstructions is escaped so it only matches a
    # literal dot, like every other path pattern in this function)
    delete instructions
    get_arr(infile, "^" base "\\.recipeInstructions(\\[[0-9]+])?$", instructions)

    # get instructions HowToSections
    delete section_names
    delete section_numsteps
    delete section_paths
    delete sections
    key = "^" base "\\.recipeInstructions(\\[[0-9]+])?\\.@type$"
    get_section_paths(infile, key, section_paths)
    section_count = array_size(section_paths)
    for (i = 0; i < section_count; i++) {
        name = select(infile, "^" section_paths[i] "\\.name$", QUERY_STR)
        section_names[i] = trim_item(name)
        delete steps
        get_arr(infile, "^" section_paths[i] ".*\\.text$", steps)
        len = array_size(steps)
        section_numsteps[i] = len
        for (j = 0; j < len; j++) {
            sections[i,j] = steps[j]
        }
    }

    # get instructions HowToSteps
    delete steps_paths
    key = "^" base "\\.recipeInstructions\\[[0-9]+]\\.@type"
    get_step_paths(infile, key, steps_paths)

    # top-level steps are collected into one extra, unnamed section
    len = array_size(steps_paths)
    if (len > 0) {
        i = section_count
        section_count++
        section_names[i] = ""
        section_numsteps[i] = len
        for (j = 0; j < len; j++) {
            name = select(infile, "^" steps_paths[j] "\\.name", QUERY_STR)
            step = select(infile, "^" steps_paths[j] "\\.text", QUERY_STR)
            # prefix the step with its name unless the name is empty,
            # repeats the text, or ends in "..."
            if (length(name) > 0 && name != step && !match(name, /\.\.\.$/)) {
                step = name " " step
            }
            sections[i,j] = step
        }
    }

    # yield can be a scalar, or it can be an array of scalars
    yield = select(infile, "^" base "\\.recipeYield", QUERY_SCALAR_LAST)
    yield = trim_item(yield)
    gsub(/people/,     "Servings", yield)
    # normalize "serving", "servings" and the literal "serving(s)"
    gsub(/serving(\(s\)|s)?/, "Servings", yield)
    if (match(yield, /^Serves /)) {
        yield = substr(yield, RSTART+RLENGTH) " Servings"
    }
    if (yield ~ /^[0-9]+$/) {
        yield = yield " Servings"
    }

    print "[[[[[" >outfile
    print title >>outfile
    if (length(category) > 0) {
        printf "C: %s\n", category >>outfile
    }
    if (length(author) > 0) {
        printf "B: %s\n", author >>outfile
    }
    if (length(yield) > 0) {
        printf "Y: %s\n", yield >>outfile
    }
    print "" >>outfile
    print_ingredients(ingredients, outfile)
    print "" >>outfile
    if (length(prep_time) > 0) {
        printf "Preparation time: %s\n", prep_time >>outfile
    }
    if (length(cook_time) > 0) {
        printf "Cooking time: %s\n", cook_time >>outfile
    }
    print "" >>outfile
    print_wrap(descr, wraplen, outfile)
    print "" >>outfile
    print_instructions(instructions, outfile)
    print_sections(sections, section_names, section_numsteps, outfile)
    print "]]]]]" >>outfile
    close(outfile)
    return
}

# Copy the contents of every JSON-LD <script> element of an HTML file into
# one JSON array.
#
# Parameters:
#   infile  - HTML file to read
#   outfile - file to create; receives "[", each script body followed by
#             ",", a closing dummy object, and "]"
#
# inscript and line are locals; callers pass only infile and outfile.
function extract_js(infile, outfile,     inscript, line) {
    inscript = 0

    printf "[" >outfile
    # getline returns -1 on a read error; testing "> 0" keeps an unreadable
    # file from turning this into an endless loop.  Reading into a local
    # also leaves $0 untouched.
    while ((getline line < infile) > 0) {
        inscript = print_innertext(inscript, line, outfile)
    }
    if (inscript) {
        # The file ended inside a script element.  print_innertext has
        # already written every line of it, so only the element separator
        # is still missing.
        print "," >>outfile
    }
    # the dummy last element absorbs the comma written after each script
    print "{\"caboose\":\"ding ding\"}" >>outfile
    print "]" >>outfile
    close(infile)
    close(outfile)
    return
}

# Report whether a file can be opened for reading.
#
# Parameters:
#   name - path of the file to probe
# Returns: 1 when the file can be read (even if it is empty), else 0.
# line and retval are locals; callers pass only name.
function file_exists(name,     line, retval) {
    retval = 0
    # getline yields 1 for a record, 0 at end of file and -1 on error, so
    # ">= 0" accepts an existing empty file.  Reading into a local keeps $0
    # intact.
    if ((getline line < name) >= 0) {
        retval = 1
    }
    close(name)
    return retval
}

# Turn an ISO 8601 time duration such as "PT1H30M" into readable text.
#
# Only strings starting with "PT" are converted; anything else is returned
# unchanged.  The H, M and S designators become " hours", " minutes" and
# " seconds".
#
# Parameters:
#   str - duration string
# Returns: e.g. "1 hours 30 minutes" for "PT1H30M".
# retval is a local; callers pass only str.
function format_iso8601_dur(str,     retval) {
    retval = str
    if (str ~ /^PT/) {
        retval = substr(retval, 3)
        gsub(/H/, " hours ", retval)
        gsub(/M/, " minutes ", retval)
        gsub(/S/, " seconds ", retval)
        # the last replacement leaves a blank at the end of the text
        sub(/ +$/, "", retval)
    }
    return retval
}

# Join the items of an array into one comma-separated string.
#
# Parameters:
#   array - items stored under the indices 0 .. size-1
# Returns: the items, each cleaned with trim_item(), joined by ",".
#          Items that are exactly "[" or "]" are skipped.
# i, len, line and retval are locals; callers pass only array.
function format_list(array,     i, len, line, retval) {
    len = array_size(array)
    retval = ""
    for (i = 0; i < len; i++) {
        line = array[i]
        if (line == "[" || line == "]") {
            continue
        }
        line = trim_item(line)
        if (length(retval) == 0) {
            retval = line
        } else {
            retval = retval "," line
        }
    }
    return retval
}

# Upper-case the first character of every blank-separated word.
#
# Parameters:
#   str - text to convert
# Returns: the words of str joined by single blanks, each with its first
#          character upper-cased; the remaining characters are unchanged.
# fields, i, n, retval and word are locals; callers pass only str.
function format_title_case(str,     fields, i, n, retval, word) {
    retval = ""
    n = split(str, fields, / /)
    for (i = 1; i <= n; i++) {
        word = fields[i]
        # substr() positions start at 1; a start of 0 is not portable.
        word = toupper(substr(word, 1, 1)) substr(word, 2)
        if (length(retval) == 0) {
            retval = word
        } else {
            retval = retval " " word
        }
    }
    return retval
}

# Normalize a piece of recipe text: decode HTML entities and escaped line
# breaks, replace typographic characters with ASCII, and abbreviate units
# of measure.
#
# Parameters:
#   str            - text to normalize
#   in_ingredients - when true, also apply the ingredient-only rules
#                    (more unit abbreviations, "," -> ";", decimal
#                    quantities -> fractions)
# Returns: the normalized text.
# after and before are locals; callers pass only str and in_ingredients.
function format_units(str, in_ingredients,     after, before) {
    # convert funny HTML characters to ASCII
    gsub(/&#x27;/,     "'", str)
    gsub(/&#0?39;/,    "'", str)
    gsub(/&#8217;/,    "'", str)
    gsub(/&#215;/,     "x", str)
    gsub(/&amp;nbsp;/, " ", str)
    gsub(/&nbsp;/,     " ", str)
    gsub(/&egrave;/,   "è", str)
    gsub(/&icirc;/,    "î", str)
    gsub(/&amp;/,      "\\&", str)
    gsub(/&quot;/,     "\"", str)
    gsub(/&#822[01];/, "\"", str)

    # convert escaped characters to ASCII
    gsub(/\\n/, "\n", str)
    gsub(/\\r/, "", str)

    # convert funny Unicode characters to ASCII
    gsub(/^½/, "1/2", str)
    gsub(/^⅓/, "1/3", str)
    gsub(/^¼/, "1/4", str)
    gsub(/^⅛/, "1/8", str)
    gsub(/^⅔/, "2/3", str)
    gsub(/^¾/, "3/4", str)
    gsub(/ ½/, " 1/2", str)
    gsub(/ ⅓/, " 1/3", str)
    gsub(/ ¼/, " 1/4", str)
    gsub(/ ⅛/, " 1/8", str)
    gsub(/ ⅔/, " 2/3", str)
    gsub(/ ¾/, " 3/4", str)
    gsub(/½/, "-1/2", str)
    gsub(/⅓/, "-1/3", str)
    gsub(/¼/, "-1/4", str)
    gsub(/⅛/, "-1/8", str)
    gsub(/⅔/, "-2/3", str)
    gsub(/¾/, "-3/4", str)
    gsub(/º/, "°",    str)
    gsub(/‘/, "'",    str)
    gsub(/’/, "'",    str)
    gsub(/“/, "\"",   str)
    gsub(/”/, "\"",   str)
    gsub(/…/, "...",  str)
    gsub(/–/, "-",    str)
    gsub(/—/, "--",   str)
    gsub(/⁄/, "/",    str)

    # All splices below keep the text up to and including position RSTART.
    # substr() positions start at 1; a start of 0 is not portable.

    # replace in (inch abbreviation) with " symbol
    while (match(str, /[[:digit:]]in /)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+3)
        str = before "\"" after
    }

    # insert space between digit and letter
    # or between ingredient and reference
    # it is important to do this step AFTER converting Unicode
    # characters, because some of the conversions change the
    # character class.

    while (match(str, /[[:digit:]][[:alpha:]]/) ||
        match(str, /[^[:space:]][*]/))
    {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+1)
        str = before " " after
    }

    # convert to standard unit abbreviations
    gsub(/ T /,               " tb ", str)
    gsub(/ t /,               " ts ", str)
    gsub(/ [Tt]ablespoons? /, " tb ", str)
    gsub(/ [Tt]bsps? /,       " tb ", str)
    gsub(/ [Tt]easpoons? /,   " ts ", str)
    gsub(/ [Tt]sps? /,        " ts ", str)

    if (in_ingredients) {
        gsub(/^ +/,        "",        str)
        gsub(/  +/,        " ",       str)
        gsub(/ bunches /,  " bn ",    str)
        gsub(/ bunch /,    " bn ",    str)
        gsub(/ cups? /,    " c ",     str)
        gsub(/ C /,        " c ",     str)
        gsub(/ gms? /,     " g ",     str)
        gsub(/ large /,    " lg ",    str)
        gsub(/ lb\. /,     " lb ",    str)
        gsub(/ medium /,   " md ",    str)
        gsub(/ ounces? /,  " oz ",    str)
        gsub(/ pinch /,    " pn ",    str)
        gsub(/^pinch /,    "1 pn ",   str)
        gsub(/\npinch /,   "\n1 pn ", str)
        gsub(/ ounces/,    " oz",     str)
        gsub(/ ounce/,     " oz",     str)
        gsub(/-ounce/,     " oz",     str)
        gsub(/ pounds? /,  " lb ",    str)
        gsub(/ quarts? /,  " qt ",    str)
        gsub(/ small /,    " sm ",    str)
        gsub(/,/,          ";",       str)
        gsub(/\.00 /,      " ",       str)
        gsub(/^0\.75 /,    "3/4 ",    str)
        gsub(/^0\.6[67] /, "2/3 ",    str)
        gsub(/^0\.50? /,   "1/2 ",    str)
        gsub(/^0\.33 /,    "1/3 ",    str)
        gsub(/^0\.25 /,    "1/4 ",    str)
        gsub(/^0\.1[23] /, "1/8 ",    str)
        # change 1 gram flour to 1 g flour, but leave 1 c gram flour as is
        if (match(str, /gram [Ff]lour/)) {
            if (match(str, /[[:digit:]][[:space:]]+gram [Ff]lour/)) {
                gsub(/ gram /,   " g ",     str)
            }
        } else {
            gsub(/ grams? /,   " g ",     str)
        }
    }

    # convert 1/3 C to 1/3 c, but leave 300 C as is
    if (match(str, /[^[:digit:]][[:digit:]] C /)) {
        # keep the two matched leading characters, replace " C "
        before = substr(str, 1, RSTART+1)
        after = substr(str, RSTART+5)
        str = before " c " after
    }

    # insert space between unit of measure and slash and next quantity
    while (match(str, /[[:alpha:]]\/[0-9]/)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+2)
        str = before " / " after
    }
    while (match(str, /[[:alpha:]]\/ /)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+2)
        str = before " /" after
    }

    return str
}

# Collect the string values of all TSV lines whose path matches regexp.
#
# Each line of infile is split on tabs into path ($1), type ($2) and value
# ($3).  Only lines of type "s" are taken.
#
# Parameters:
#   infile - TSV file to scan; it is closed again before returning
#   regexp - dynamic regular expression matched against the path
#   lines  - output array; values are stored under 0, 1, 2, ...
#
# Sets the global FS to a tab and overwrites $0.
# i is a local; callers pass only infile, regexp and lines.
function get_arr(infile, regexp, lines,     i) {
    FS = "\t"
    i = 0
    # getline returns -1 on a read error; testing "> 0" keeps an unreadable
    # file from turning this into an endless loop.
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s") {
            lines[i] = $3
            i++
        }
    }
    close(infile)
    return
}

# Look up the name of one author object.
#
# Parameters:
#   infile - TSV file of flattened JSON
#   base   - regex-escaped path of the author object
# Returns: the value of <base>.name when present; otherwise the name of the
#          object found through <base>.@id; "" when neither is found.
# author_id, name, ref and retval are locals; callers pass only infile and
# base.
function get_author(infile, base,     author_id, name, ref, retval) {
    retval = ""
    name = select(infile, "^" base "\\.name$", QUERY_STR)
    if (length(name) > 0) {
        retval = name
    } else {
        author_id = select(infile, "^" base "\\.@id$", QUERY_STR)
        if (length(author_id) > 0) {
            # NOTE(review): select() stops at the first ".@id" line whose
            # value equals author_id.  If the referring line comes before
            # the referenced object in the file, that line is the one
            # found -- confirm against real data.
            ref = select(infile, author_id, QUERY_AUTHOR_REF)
            ref = escape_regex(ref)
            retval = select(infile, "^" ref "\\.name$", QUERY_STR)
        }
    }
    return retval
}

# Collect the paths of all objects whose @type is "HowToSection".
#
# Parameters:
#   infile - TSV file to scan; it is closed again before returning
#   regexp - dynamic regular expression matched against the path of the
#            @type line; the caller anchors it at the start of the path
#   lines  - output array; regex-escaped object paths stored under
#            0, 1, 2, ...
#
# Sets the global FS to a tab and overwrites $0.
# i is a local; callers pass only infile, regexp and lines.
function get_section_paths(infile, regexp, lines,     i) {
    FS = "\t"
    i = 0
    # getline returns -1 on a read error; testing "> 0" keeps an unreadable
    # file from turning this into an endless loop.
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s" && $3 == "HowToSection") {
            # Cut the 6 characters of ".@type" off the matched text.
            # substr() positions start at 1; a start of 0 is not portable.
            lines[i] = escape_regex(substr($1, 1, RLENGTH-6))
            i++
        }
    }
    close(infile)
    return
}

# Collect the paths of all objects whose @type is "HowToStep".
#
# Parameters:
#   infile - TSV file to scan; it is closed again before returning
#   regexp - dynamic regular expression matched against the path of the
#            @type line; the caller anchors it at the start of the path
#   lines  - output array; regex-escaped object paths stored under
#            0, 1, 2, ...
#
# Sets the global FS to a tab and overwrites $0.
# i is a local; callers pass only infile, regexp and lines.
function get_step_paths(infile, regexp, lines,     i) {
    FS = "\t"
    i = 0
    # getline returns -1 on a read error; testing "> 0" keeps an unreadable
    # file from turning this into an endless loop.
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s" && $3 == "HowToStep") {
            # Cut the 6 characters of ".@type" off the matched text.
            # substr() positions start at 1; a start of 0 is not portable.
            lines[i] = escape_regex(substr($1, 1, RLENGTH-6))
            i++
        }
    }
    close(infile)
    return
}

# Append the ingredient lines to outfile, one per line.
#
# Every item is cleaned with trim_item(), normalized with format_units()
# using the ingredient rules, and capitalized with capitalize_ingredient().
# Items that are exactly "[" or "]" are skipped.
#
# Parameters:
#   ingredients - items stored under the indices 0 .. size-1
#   outfile     - file to append to
# i, line and n are locals; callers pass only ingredients and outfile.
function print_ingredients(ingredients, outfile,     i, line, n) {
    n = array_size(ingredients)
    for (i = 0; i < n; i++) {
        line = ingredients[i]
        if (line == "[" || line == "]") {
            continue
        }
        line = trim_item(line)
        line = format_units(line, 1)
        line = capitalize_ingredient(line)
        print line >>outfile
    }
    return
}

# Scan one line of HTML and append any JSON-LD script content to outfile.
#
# Outside a script element the line is searched for an opening
# <script ... type="application/ld+json" ...> tag (the attribute value may
# use double or single quotes).  Inside one, text up to </script> is
# written followed by a line holding ","; if the element does not end on
# this line, the rest of the line is written.
#
# Parameters:
#   inscript - true when the previous line ended inside a script element
#   text     - the line to scan
#   outfile  - file to append to
# Returns: 1 when the line ends inside a script element, else 0.
# innertext is a local; callers pass only inscript, text and outfile.
function print_innertext(inscript, text, outfile,     innertext) {
    while (length(text) > 0) {
        if (!inscript) {
            if (match(text, /<script [^>]*type=["']application\/ld[+]json["'][^>]*>/)) {
                inscript = 1
                # continue right after the opening tag
                text = substr(text, RSTART+RLENGTH)
            } else {
                # no JSON-LD on the rest of this line
                text = ""
            }
        }
        if (inscript) {
            if (match(text, /<\/script>/)) {
                inscript = 0
                innertext = substr(text, 1, RSTART-1)
                # keep scanning: a line may hold several script elements
                text = substr(text, RSTART+RLENGTH)
                print innertext >>outfile
                print "," >>outfile
            } else {
                print text >>outfile
                text = ""
            }
        }
    }
    return inscript
}

# Append plain-string instructions to outfile as wrapped paragraphs.
#
# Every item is cleaned with trim_item(), normalized with format_units(),
# wrapped with print_wrap() at the global wraplen, and followed by an empty
# line.  Items that are exactly "[" or "]" are skipped.
#
# Parameters:
#   instructions - items stored under the indices 0 .. size-1
#   outfile      - file to append to
# i, line and n are locals; callers pass only instructions and outfile.
function print_instructions(instructions, outfile,    i, line, n) {
    n = array_size(instructions)
    for (i = 0; i < n; i++) {
        line = instructions[i]
        if (line == "[" || line == "]") {
            continue
        }
        line = trim_item(line)
        line = format_units(line, 0)
        print_wrap(line, wraplen, outfile)
        print "" >>outfile
    }
    return
}

# Append the instruction sections to outfile.
#
# For each section a non-empty name is written as "name:" followed by an
# empty line; then every step is cleaned with trim_item(), normalized with
# format_units(), wrapped with print_wrap() at the global wraplen, and
# followed by an empty line.
#
# Parameters:
#   sections         - steps, indexed [section, step], both from 0
#   section_names    - section names under 0 .. count-1 ("" for unnamed)
#   section_numsteps - number of steps of each section
#   outfile          - file to append to
# i, j, line, m, n and name are locals; callers pass only the first four
# arguments.
function print_sections(sections, section_names, section_numsteps, outfile,
     i, j, line, m, n, name)
{
    n = array_size(section_names)
    for (i = 0; i < n; i++) {
        name = section_names[i]
        if (length(name) > 0) {
            print name ":\n" >>outfile
        }
        m = section_numsteps[i]
        for (j = 0; j < m; j++) {
            line = sections[i,j]
            line = trim_item(line)
            line = format_units(line, 0)
            print_wrap(line, wraplen, outfile)
            print "" >>outfile
        }
    }
    return
}

# Append str to outfile, hard-wrapped at blanks.
#
# While the remaining text is longer than len, it is broken at the last
# blank within the first len characters; the blank stays at the end of the
# line written.  Text that has no blank in that range is written unbroken.
#
# Parameters:
#   str     - text to write
#   len     - maximum line length; 0 or less disables wrapping
#   outfile - file to append to
# after, before, buf and chunk are locals; callers pass only str, len and
# outfile.
function print_wrap(str, len, outfile,     after, before, buf, chunk) {
    buf = str
    # Honour the len argument rather than the global wraplen, so the
    # function wraps at whatever width the caller asks for.
    if (len > 0) {
        while (length(buf) > len) {
            # substr() positions start at 1; a start of 0 is not portable.
            chunk = substr(buf, 1, len)
            if (match(chunk, / [^ ]*$/)) {
                # RSTART is the position of the last blank in chunk
                before = substr(buf, 1, RSTART)
                after = substr(buf, RSTART+1)
                print before >>outfile
                buf = after
            } else {
                break
            }
        }
    }
    print buf >>outfile
    return
}

# Run one of the QUERY_* lookups against the flattened JSON in infile.
#
# Each line of infile is split on tabs into path ($1), type ($2) and value
# ($3).  What base means and what is returned depends on key:
#
#   QUERY_AUTHOR_REF   base is a literal @id value; returns the path of the
#                      first object whose ".@id" string equals it
#   QUERY_AUTHOR_TYPE  base is a regex-escaped path; returns the type code
#                      of <base>.author
#   QUERY_AUTHORS_LEN  base is a regex-escaped path; returns the number of
#                      <base>.author[N].name string entries
#   QUERY_RECIPE_BASE  base is unused; returns the path of the first object
#                      whose "@type" has the string value "Recipe"
#   QUERY_SCALAR_LAST  base is a regex; returns the value of the last
#                      matching string or number entry
#   QUERY_SECTIONS_LEN base is unused; returns the number of "@type"
#                      entries equal to "HowToSection"
#   QUERY_STR          base is a regex; returns the value of the first
#                      matching string entry
#
# Returns "" (or 0 for the two counting queries) when nothing matches.
# Sets the global FS to a tab and overwrites $0.
# count and retval are locals; callers pass only infile, base and key.
function select(infile, base, key,    count, retval) {
    count = 0
    retval = ""
    FS = "\t"
    # getline returns -1 on a read error; testing "> 0" keeps an unreadable
    # file from turning this into an endless loop.
    while ((getline < infile) > 0) {
        if (key == QUERY_AUTHOR_REF) {
            if (match($1, /\.@id$/) && $2 == "s" && $3 == base) {
                # path without the trailing ".@id"; substr() positions
                # start at 1, a start of 0 is not portable
                retval = substr($1, 1, RSTART-1)
                break
            }
        } else if (key == QUERY_AUTHOR_TYPE) {
            if (match($1, "^" base "\\.author$")) {
                retval = $2
                break
            }
        } else if (key == QUERY_AUTHORS_LEN) {
            if (match($1, "^" base "\\.author\\[[0-9]+]\\.name") && $2 == "s") {
                count++
            }
        } else if (key == QUERY_RECIPE_BASE) {
            # not anchored at the end, so an element of an @type array
            # matches as well
            if (match($1, /\.@type/) && $2 == "s" && $3 == "Recipe") {
                retval = substr($1, 1, RSTART-1)
                break
            }
        } else if (key == QUERY_SCALAR_LAST) {
            if (match($1, base) && ($2 == "s" || $2 == "n")) {
                retval = $3
            }
        } else if (key == QUERY_SECTIONS_LEN) {
            if (match($1, /\.@type$/) && $2 == "s" && $3 == "HowToSection") {
                count++
            }
        } else if (key == QUERY_STR) {
            if (match($1, base) && $2 == "s") {
                retval = $3
                break
            }
        }
    }
    close(infile)
    if (key == QUERY_AUTHORS_LEN ||
        key == QUERY_SECTIONS_LEN)
    {
        retval = count
    }
    return retval
}


# Strip whitespace from both ends of a string.
#
# Parameters:
#   str - text to trim
# Returns: str without leading and trailing [[:space:]] characters.
# retval is a local; callers pass only str.
function trim(str,     retval) {
    # Both patterns are anchored, so each can match at most once and a
    # single sub() is enough.  Scalars are passed by value, so editing str
    # does not affect the caller.
    sub(/^[[:space:]]+/, "", str)
    sub(/[[:space:]]+$/, "", str)
    retval = str
    return retval
}

# Clean one list item: trim surrounding whitespace, then remove one leading
# double quote, one trailing comma and one trailing double quote, in that
# order.
#
# Parameters:
#   str - raw item text
# Returns: the cleaned item.
# retval is a local; callers pass only str.
function trim_item(str,     retval) {
    retval = trim(str)
    # Every pattern is anchored, so a single sub() removes at most the one
    # character it targets.  The comma goes before the closing quote so
    # that an item ending in quote-comma loses both.
    sub(/^"/, "", retval)
    sub(/,$/, "", retval)
    sub(/"$/, "", retval)
    return retval
}

# Program entry point: define the settings and query constants, check the
# command line, then run the three conversion stages
#   in.htm -> out.js -> out.tsv -> out.txt
# Exits with status 1 on a usage error or a missing input file, else 0.
BEGIN {
    wraplen_dos    = 80
    wraplen_usenet = 72
    wraplen_gopher = 70
    wraplen_none   = 0

    # hard wrap line length for recipe paragraphs, choose your poison
    wraplen = wraplen_gopher

    # query kinds understood by select()
    QUERY_AUTHOR_REF   = 1
    QUERY_AUTHOR_TYPE  = 2
    QUERY_AUTHORS_LEN  = 3
    QUERY_RECIPE_BASE  = 4
    QUERY_SCALAR_LAST  = 5
    QUERY_SECTIONS_LEN = 6
    QUERY_STR          = 7

    # ARGV[0] is the interpreter, so two arguments mean ARGC == 3
    if (ARGC < 3) {
        print "Usage: json2mm.awk in.htm out"
        print ""
        print "Version 14"
        print ""
        print "Reads in.htm"
        print "Creates out.js, out.tsv and out.txt"
        exit 1
    }
    infile = ARGV[1]
    outbase = ARGV[2]

    if (!file_exists(infile)) {
        print "Error: Could not find file: " infile
        exit 1
    }

    # stage 1: pull the JSON-LD script contents out of the HTML
    jsfile = outbase ".js"
    extract_js(infile, jsfile)

    # stage 2: flatten the JSON with json2tsv
    tsvfile = outbase ".tsv"
    export_tsv(jsfile, tsvfile)

    # stage 3: write the Meal-Master text
    txtfile = outbase ".txt"
    export_txt(tsvfile, txtfile)

    exit 0
}