# Convert web page with JSON-LD Recipe data to Meal-Master format.
# Version 14
#
# Usage: awk -f jsontomm.awk infile.htm outbase
# Creates outbase.js and outbase.txt
#
# Hints:
#
# To fetch a recipe, use curl:
# curl --insecure -o web.htm URL
#
# On DOS, use GAWK.EXE.
#
# This script depends on json2tsv being in your path.
#
# Use MMBUSTER.EXE to convert outbase.txt to outbase.mmf for import into
# Meal-Master.
#
# JSON-LD spec:
# https://developers.google.com/search/docs/appearance/structured-data/recipe

# Count the elements of array a.
# Extra parameters (retval, i) are locals; callers pass only the array.
# The loop variable is local so that counting never clobbers a global i.
function array_size(a,    retval, i) {
    retval = 0
    for (i in a) {
        retval++
    }
    return retval
}

# Upper-case the first letter of the ingredient name in line.
# If the line starts with a quantity (digits, spaces, "/" or ".") and
# optional unit abbreviations, the letter following them is capitalized;
# otherwise the first character of the line is.
# Strings are indexed from 1: a start position of 0 makes some awk
# implementations return one character fewer than requested.
function capitalize_ingredient(line,    i, retval, matched, before, c, after) {
    retval = line
    matched = match(line, /^ *[[:digit:][:space:]\/.]+ +((bn|c|cl|ds|g|kg|l|lb|lg|md|ml|oz|pn|qt|sm|tb|ts) +)*/)
    if (matched) {
        # the pattern is anchored, so RLENGTH is also the end of the prefix
        before = substr(line, 1, RLENGTH)
        c = substr(line, RLENGTH+1, 1)
        after = substr(line, RLENGTH+2)
        retval = before toupper(c) after
    } else {
        c = substr(line, 1, 1)
        after = substr(line, 2)
        retval = toupper(c) after
    }
    return retval
}

# Return str without its last character.
function chop(str,    retval) {
    retval = substr(str, 1, length(str) - 1)
    return retval
}

# Backslash-escape "[" and "." in str so it can be embedded in a regexp.
# Only these two characters are escaped.
function escape_regex(str) {
    gsub(/\[/, "\\[", str)
    gsub(/\./, "\\.", str)
    return str
}

# Run json2tsv on infile, writing its output to tsvfile.
# Prints an error and exits with status 1 if the command fails.
# NOTE(review): the file names are placed in the shell command unquoted.
function export_tsv(infile, tsvfile,    result) {
    cmd = "json2tsv -n < " infile " > " tsvfile
    result = system(cmd)
    if (result != 0) {
        printf "Error: json2tsv error code: %d\n", result
        exit 1
    }
    return
}

# Read the recipe from the TSV file infile and write it to outfile,
# bracketed by "[[[[[" and "]]]]]" lines.
# Exits with status 1 if no Recipe object or no recipe title is found.
# select(), trim_item(), print_wrap(), print_instructions(),
# print_sections(), wraplen and the QUERY_* values are defined outside
# this part of the file.
function export_txt(infile, outfile,    i, j, key, name) {
    # JSON-LD Recipe format can be a simple object, or an array of objects,
    # or a graph, or a combination of all of these.  Just traverse the
    # whole tree looking for an object where @type either is "Recipe" or
    # is an array containing "Recipe".
    base = select(infile, "", QUERY_RECIPE_BASE)
    if (length(base) == 0) {
        print "Error: No JSON-LD Recipe object found"
        exit 1
    }
    base = escape_regex(base)

    # if no title, fail early.  There might be no JSON-LD data.
    title = select(infile, "^" base "\\.name$", QUERY_STR)
    if (length(title) == 0) {
        print "Error: Could not find recipe title"
        exit 1
    }
    gsub(/^Recipe: /, "", title)
    title = format_title_case(title)

    # author can be a string, or an object, or an array of objects,
    # and an object may directly represent an author,
    # or an object may indirectly reference another document via URI/UUID
    author_type = select(infile, base, QUERY_AUTHOR_TYPE)
    author = ""
    if (author_type == "s") {
        author = select(infile, "^" base "\\.author$", QUERY_STR)
    } else if (author_type == "o") {
        author = get_author(infile, base "\\.author")
    } else if (author_type == "a") {
        author = ""
        authors_len = select(infile, base, QUERY_AUTHORS_LEN)
        for (i = 0; i < authors_len; i++) {
            key = base "\\.author\\[" i "]"
            name = get_author(infile, key)
            if (length(author) == 0) {
                author = name
            } else {
                author = author "," name
            }
        }
    }

    descr = select(infile, "^" base "\\.description", QUERY_STR)
    descr = format_units(descr, 0)

    cook_time = select(infile, "^" base "\\.cookTime", QUERY_STR)
    cook_time = format_iso8601_dur(cook_time)

    prep_time = select(infile, "^" base "\\.prepTime", QUERY_STR)
    if (prep_time == "PT0M") {
        prep_time = ""
    }
    prep_time = format_iso8601_dur(prep_time)

    delete categories
    get_arr(infile, "^" base "\\.recipeCategory", categories)
    category = format_list(categories)

    delete cuisines
    get_arr(infile, "^" base "\\.recipeCuisine", cuisines)
    cuisine = format_list(cuisines)

    # cuisine is appended to the category list
    if (length(category) == 0) {
        category = cuisine
    } else {
        if (length(cuisine) > 0) {
            category = category "," cuisine
        }
    }
    category = format_units(category, 0)

    # ingredients is an array of strings
    # unlike Meal-Master, ingredients cannot have sections
    delete ingredients
    get_arr(infile, "^" base "\\.recipeIngredient", ingredients)

    # instructions is an array of strings, HowToSections, and/or HowToSteps

    # get instructions strings
    # (the dot before recipeInstructions is escaped like in every other
    # pattern here, so it only matches a literal ".")
    delete instructions
    get_arr(infile, "^" base "\\.recipeInstructions(\\[[0-9]+])?$", instructions)

    # get instructions HowToSections
    delete section_names
    delete section_numsteps
    delete section_paths
    delete sections
    key = "^" base "\\.recipeInstructions(\\[[0-9]+])?\\.@type$"
    get_section_paths(infile, key, section_paths)
    section_count = array_size(section_paths)
    for (i = 0; i < section_count; i++) {
        name = select(infile, "^" section_paths[i] "\\.name$", QUERY_STR)
        section_names[i] = trim_item(name)
        delete steps
        get_arr(infile, "^" section_paths[i] ".*\\.text$", steps)
        len = array_size(steps)
        section_numsteps[i] = len
        for (j = 0; j < len; j++) {
            sections[i,j] = steps[j]
        }
    }

    # get instructions HowToSteps
    # they are collected into one extra, unnamed section
    delete steps_paths
    key = "^" base "\\.recipeInstructions\\[[0-9]+]\\.@type"
    get_step_paths(infile, key, steps_paths)
    len = array_size(steps_paths)
    if (len > 0) {
        i = section_count
        section_count++
        section_names[i] = ""
        section_numsteps[i] = len
        for (j = 0; j < len; j++) {
            name = select(infile, "^" steps_paths[j] "\\.name", QUERY_STR)
            step = select(infile, "^" steps_paths[j] "\\.text", QUERY_STR)
            # prefix the step with its name, unless the name is empty,
            # equals the text, or ends in "..."
            if (length(name) > 0 && name != step && !match(name, /\.\.\.$/)) {
                step = name " " step
            }
            sections[i,j] = step
        }
    }

    # yield can be a scalar, or it can be an array of scalars
    yield = select(infile, "^" base "\\.recipeYield", QUERY_SCALAR_LAST)
    yield = trim_item(yield)
    gsub(/people/, "Servings", yield)
    gsub(/serving(s)/, "Servings", yield)
    if (match(yield, /^Serves /)) {
        yield = substr(yield, RSTART+RLENGTH) " Servings"
    }
    if (yield ~ /^[0-9]+$/) {
        yield = yield " Servings"
    }

    print "[[[[[" >outfile
    print title >>outfile
    if (length(category) > 0) {
        printf "C: %s\n", category >>outfile
    }
    if (length(author) > 0) {
        printf "B: %s\n", author >>outfile
    }
    if (length(yield) > 0) {
        printf "Y: %s\n", yield >>outfile
    }
    print "" >>outfile
    print_ingredients(ingredients, outfile)
    print "" >>outfile
    if (length(prep_time) > 0) {
        printf "Preparation time: %s\n", prep_time >>outfile
    }
    if (length(cook_time) > 0) {
        printf "Cooking time: %s\n", cook_time >>outfile
    }
    print "" >>outfile
    print_wrap(descr, wraplen, outfile)
    print "" >>outfile
    print_instructions(instructions, outfile)
    print_sections(sections, section_names, section_numsteps, outfile)
    print "]]]]]" >>outfile
    close(outfile)
    return
}

# Write a JSON array to outfile, ending with a {"caboose":"ding ding"}
# element and a closing "]".
# NOTE(review): the "while (getline ..." statement below is incomplete in
# this copy of the file; text between the input redirection and
# ">outfile" appears to be missing.  It is kept exactly as found and
# needs to be restored from a good copy -- TODO confirm.
function extract_js(infile, outfile) {
    inscript = 0
    printf "[" >outfile
    while (getline >outfile print "," >>outfile
    }
    print "{\"caboose\":\"ding ding\"}" >>outfile
    print "]" >>outfile
    close(infile)
    close(outfile)
    return
}

# Return 1 if a record can be read from the file called name, else 0.
# The plain getline used for the probe overwrites $0.
function file_exists(name,    retval) {
    retval = 0
    if ((getline < name) > 0) {
        retval = 1
    }
    close(name)
    return retval
}

# Convert a duration starting with "PT" into words, e.g. "PT1H30M"
# becomes "1 hours 30 minutes".  Only H and M are translated; any other
# string is returned unchanged.
function format_iso8601_dur(str,    retval) {
    retval = str
    if (str ~ /^PT/) {
        retval = substr(retval, 3)
        gsub(/H/, " hours ", retval)
        gsub(/M/, " minutes ", retval)
        # the replacements above leave a blank at the end
        sub(/ +$/, "", retval)
    }
    return retval
}

# Join the elements array[0], array[1], ... into a comma-separated string.
# Elements that are just "[" or "]" are skipped; the others are passed
# through trim_item() first.
function format_list(array,    i, retval, len, line) {
    len = array_size(array)
    retval = ""
    for (i = 0; i < len; i++) {
        line = array[i]
        if (line == "[" || line == "]") {
            continue
        }
        line = trim_item(line)
        if (length(retval) == 0) {
            retval = line
        } else {
            retval = retval "," line
        }
    }
    return retval
}

# Upper-case the first letter of every space-separated word of str.
# The rest of each word is left as it is.
function format_title_case(str,    i, retval, n, fields, word, first, last) {
    retval = ""
    n = split(str, fields, / /)
    for (i = 1; i <= n; i++) {
        word = fields[i]
        if (length(word) == 1) {
            word = toupper(word)
        } else {
            # position 1 is the first character of a string
            first = substr(word, 1, 1)
            last = substr(word, 2)
            word = toupper(first) last
        }
        if (length(retval) == 0) {
            retval = word
        } else {
            retval = retval " " word
        }
    }
    return retval
}

# Normalize text taken from a recipe: replace HTML and Unicode special
# characters, insert missing blanks and abbreviate units of measure.
# When in_ingredients is true, additional ingredient-only rules apply
# (more unit abbreviations, "," to ";", decimal to fraction).
# Returns the converted string.
function format_units(str, in_ingredients,    i, n, before, after) {
    # convert funny HTML characters to ASCII
    # NOTE(review): several patterns in this group replace a character
    # with itself or contain stray characters; it looks as if HTML
    # entities in the patterns were decoded in this copy of the file.
    # They are kept exactly as found -- TODO compare with a good copy.
    gsub(/'/, "'", str)
    gsub(/�?39;/, "'", str)
    gsub(/’/, "'", str)
    gsub(/×/, "x", str)
    gsub(/&nbsp;/, " ", str)
    gsub(/ /, " ", str)
    gsub(/è/, "è", str)
    gsub(/î/, "î", str)
    gsub(/&/, "\\&", str)
    gsub(/"/, "\"", str)
    gsub(/̶[01];/, "\"", str)

    # convert escaped characters to ASCII
    gsub(/\\n/, "\n", str)
    gsub(/\\r/, "", str)

    # convert funny Unicode characters to ASCII
    # a fraction at the start or after a blank is replaced as is;
    # anywhere else it gets a "-" in front
    gsub(/^½/, "1/2", str)
    gsub(/^⅓/, "1/3", str)
    gsub(/^¼/, "1/4", str)
    gsub(/^⅛/, "1/8", str)
    gsub(/^⅔/, "2/3", str)
    gsub(/^¾/, "3/4", str)
    gsub(/ ½/, " 1/2", str)
    gsub(/ ⅓/, " 1/3", str)
    gsub(/ ¼/, " 1/4", str)
    gsub(/ ⅛/, " 1/8", str)
    gsub(/ ⅔/, " 2/3", str)
    gsub(/ ¾/, " 3/4", str)
    gsub(/½/, "-1/2", str)
    gsub(/⅓/, "-1/3", str)
    gsub(/¼/, "-1/4", str)
    gsub(/⅛/, "-1/8", str)
    gsub(/⅔/, "-2/3", str)
    gsub(/¾/, "-3/4", str)
    gsub(/º/, "°", str)
    gsub(/‘/, "'", str)
    gsub(/’/, "'", str)
    gsub(/“/, "\"", str)
    gsub(/”/, "\"", str)
    gsub(/…/, "...", str)
    gsub(/–/, "-", str)
    gsub(/—/, "--", str)
    gsub(/⁄/, "/", str)

    # replace in (inch abbreviation) with " symbol
    while (match(str, /[[:digit:]]in /)) {
        # keep everything up to and including the digit
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+3)
        str = before "\"" after
    }

    # insert space between digit and letter
    # or between ingredient and reference
    # it is important to do this step AFTER converting Unicode
    # characters, because some of the conversions change the
    # character class.
    while (match(str, /[[:digit:]][[:alpha:]]/) ||
           match(str, /[^[:space:]][*]/)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+1)
        str = before " " after
    }

    # convert to standard unit abbreviations
    gsub(/ T /, " tb ", str)
    gsub(/ t /, " ts ", str)
    gsub(/ [Tt]ablespoons? /, " tb ", str)
    gsub(/ [Tt]bsps? /, " tb ", str)
    gsub(/ [Tt]easpoons? /, " ts ", str)
    gsub(/ [Tt]sps? /, " ts ", str)
    if (in_ingredients) {
        gsub(/^ +/, "", str)
        gsub(/ +/, " ", str)
        gsub(/ bunches /, " bn ", str)
        gsub(/ bunch /, " bn ", str)
        gsub(/ cups? /, " c ", str)
        gsub(/ C /, " c ", str)
        gsub(/ gms? /, " g ", str)
        gsub(/ large /, " lg ", str)
        gsub(/ lb\. /, " lb ", str)
        gsub(/ medium /, " md ", str)
        gsub(/ ounces? /, " oz ", str)
        gsub(/ pinch /, " pn ", str)
        gsub(/^pinch /, "1 pn ", str)
        gsub(/\npinch /, "\n1 pn ", str)
        gsub(/ ounces/, " oz", str)
        gsub(/ ounce/, " oz", str)
        gsub(/-ounce/, " oz", str)
        gsub(/ pounds? /, " lb ", str)
        gsub(/ quarts? /, " qt ", str)
        gsub(/ small /, " sm ", str)
        gsub(/,/, ";", str)
        gsub(/\.00 /, " ", str)
        gsub(/^0\.75 /, "3/4 ", str)
        gsub(/^0\.6[67] /, "2/3 ", str)
        gsub(/^0\.50? /, "1/2 ", str)
        gsub(/^0\.33 /, "1/3 ", str)
        gsub(/^0\.25 /, "1/4 ", str)
        gsub(/^0\.1[23] /, "1/8 ", str)

        # change 1 gram flour to 1 g flour, but leave 1 c gram flour as is
        if (match(str, /gram [Ff]lour/)) {
            if (match(str, /[[:digit:]][[:space:]]+gram [Ff]lour/)) {
                gsub(/ gram /, " g ", str)
            }
        } else {
            gsub(/ grams? /, " g ", str)
        }
    }

    # convert 1/3 C to 1/3 c, but leave 300 C as is
    if (match(str, /[^[:digit:]][[:digit:]] C /)) {
        # the match is 5 characters long: non-digit, digit, " C "
        before = substr(str, 1, RSTART+1)
        after = substr(str, RSTART+5)
        str = before " c " after
    }

    # insert space between unit of measure and slash and next quantity
    while (match(str, /[[:alpha:]]\/[0-9]/)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+2)
        str = before " / " after
    }
    while (match(str, /[[:alpha:]]\/ /)) {
        before = substr(str, 1, RSTART)
        after = substr(str, RSTART+2)
        str = before " /" after
    }

    return str
}

# Collect into lines[0], lines[1], ... the third field of every record of
# infile whose first field matches regexp and whose second field is "s".
# Sets the global FS to a tab.
function get_arr(infile, regexp, lines,    i) {
    FS = "\t"
    i = 0
    # getline returns -1 on a read error; test for > 0 so that an
    # unreadable file ends the loop instead of spinning forever
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s") {
            lines[i] = $3
            i++
        }
    }
    close(infile)
    return
}

# Return the name of the author object found under the path regexp base.
# If the object has no name but has an @id, the name is looked up in the
# object that select() returns for QUERY_AUTHOR_REF.
# Returns "" if no name is found.
function get_author(infile, base,    name, ref, retval, author_id) {
    retval = ""
    name = select(infile, "^" base "\\.name$", QUERY_STR)
    if (length(name) > 0) {
        retval = name
    } else {
        author_id = select(infile, "^" base "\\.@id$", QUERY_STR)
        if (length(author_id) > 0) {
            ref = select(infile, author_id, QUERY_AUTHOR_REF)
            ref = escape_regex(ref)
            retval = select(infile, "^" ref "\\.name$", QUERY_STR)
        }
    }
    return retval
}

# Collect into lines[0], lines[1], ... the regexp-escaped path of every
# record of infile whose first field matches regexp and whose value is
# the string "HowToSection".  The last 6 characters of the match (the
# length of ".@type") are cut off the path.
# Sets the global FS to a tab.
function get_section_paths(infile, regexp, lines,    i) {
    FS = "\t"
    i = 0
    # stop on end of file and on read errors alike
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s" && $3 == "HowToSection") {
            lines[i] = escape_regex(substr($1, 1, RLENGTH-6))
            i++
        }
    }
    close(infile)
    return
}

# Same as get_section_paths(), but for records whose value is the string
# "HowToStep".
# Sets the global FS to a tab.
function get_step_paths(infile, regexp, lines,    i) {
    FS = "\t"
    i = 0
    # stop on end of file and on read errors alike
    while ((getline < infile) > 0) {
        if (match($1, regexp) && $2 == "s" && $3 == "HowToStep") {
            lines[i] = escape_regex(substr($1, 1, RLENGTH-6))
            i++
        }
    }
    close(infile)
    return
}

# Append the ingredients, one per line, to outfile.
# Elements that are just "[" or "]" are skipped; every other element is
# trimmed, unit-converted and capitalized before it is printed.
function print_ingredients(ingredients, outfile,    i, n, line) {
    n = array_size(ingredients)
    for (i = 0; i < n; i++) {
        line = ingredients[i]
        if (line == "[" || line == "]") {
            continue
        }
        line = trim_item(line)
        line = format_units(line, 1)
        line = capitalize_ingredient(line)
        print line >>outfile
    }
    return
}

function print_innertext(inscript, text, outfile, result) { while (length(text) > 0) { if (!inscript) { result = match(text, /