ttwokenize.py - counterfacto - a small software tool that analyzes Twitter and highlights counterfactual statements
 (HTM) git clone git://parazyd.org/counterfacto.git
       ---
       ttwokenize.py (13000B)
       ---
# -*- coding: utf-8 -*-
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:

(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
       Brendan O'Connor, Michel Krieger, and David Ahn.
       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
    for the CMU ARK Twitter POS Tagger
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)

Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP

There have been at least 2 other Java ports, but they are not in the lineage for the code here.

Ported to Python by Myle Ott <myleott@gmail.com>.
"""

from __future__ import print_function

import operator
import re
import HTMLParser

def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'

Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)

punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq   = punctChars+"+"    #'anthem'. => ' anthem '.
punctSeq   = r"['\"“”‘’]+|[.?!,…]+|[:;]+"   #'anthem'. => ' anthem ' .
entity     = r"&(?:amp|lt|gt|quot);"
#  URLs


# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9

urlStart1  = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs   = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"   #TODO: remove obscure country domains?
urlStart2  = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody    = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd     = r"(?:\.\.+|[<>]|\s|$)"
url        = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"

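# Hedged, illustrative sanity check (added for exposition; this helper is NOT
# part of the upstream twokenize module and is never called). It documents the
# kinds of strings the composed `url` pattern above is expected to match:
# scheme-prefixed URLs and bare domains ending in a known TLD.
def _url_pattern_examples():
    assert re.search(url, "read http://example.com/page for details")
    assert re.search(url, "read example.com for details")
    assert not re.search(url, "no links in this sentence")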

# Numeric
timeLike   = r"\d+(?::\d+){1,2}"
#numNum     = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb  = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')

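# Another illustrative, never-called helper (not in the upstream module):
# rough expected behavior of the numeric patterns above, assuming the Python 2
# byte-string regexes used throughout this file.
def _numeric_pattern_examples():
    assert re.match(timeLike, "12:53")                 # clock times
    assert re.match(timeLike, "12:53:07")              # up to two colon groups
    assert re.match(numberWithCommas, "1,000,000")     # comma-grouped numbers
    assert not re.match(numberWithCommas, "1000000")   # requires the commas
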
# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1  = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2  = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators  = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"

#  Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned

# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well

# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"


emoticon = regex_or(
        # Standard version  :) :( :] :D :P
        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

        # reversed version (: D:  use positive lookbehind to remove "(word):"
        # because eyes on the right side is more ambiguous with the standard usage of : ;
        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
        eastEmote.replace("2", "1", 1), basicface,
        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

        # myleott: o.O and O.o are two of the biggest sources of differences
        #          between this and the Java version. One little hack won't hurt...
        oOEmote
)

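# Hedged, illustrative helper (not part of the upstream module, never called):
# the composed `emoticon` pattern should pick out common smileys, including
# repeated mouths, while leaving ordinary words alone.
def _emoticon_examples():
    assert re.search(emoticon, "great news :-)").group(0) == ":-)"
    assert re.search(emoticon, "=))")                     # repeated happy mouth
    assert not re.search(emoticon, "plain words only")
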
Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))

# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
# "hello (@person)" ==> "hello (@person )"  WRONG
# "hello (@person)" ==> "hello ( @person )"  RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
# has poor content-symbol detection.

# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+"  #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@@][a-zA-Z0-9_]+"

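# Hedged, illustrative helper (not upstream, never called): hashtags and
# at-mentions are matched as single protected tokens so that later
# tokenization steps cannot split them apart.
def _hashtag_mention_examples():
    assert re.match(Hashtag, "#hashtag")
    assert re.match(Hashtag, "#40")          # numeric "hashtags" are kept too
    assert re.match(AtMention, "@person")
    assert not re.match(Hashtag, "hashtag")  # requires the leading '#'
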
# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected  = re.compile(
    unicode(regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        #numNum,
        numberWithCommas,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        arbitraryAbbrev,
        separators,
        decorations,
        embeddedApostrophe,
        Hashtag,
        AtMention
    ).decode('utf-8')), re.UNICODE)

# Edge punctuation
# Want: 'foo' => ' foo '
# While also:   don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
# I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.

# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars    = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars    = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct    = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)"  # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft  = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input

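# Hedged, illustrative helper (not upstream, never called): splitEdgePunct pads
# punctuation that sits at a word's edge with spaces, while leaving
# word-internal punctuation such as apostrophes untouched.
def _split_edge_punct_examples():
    assert splitEdgePunct("(hello)") == "( hello )"
    assert splitEdgePunct("don't") == "don't"
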
# The main work of tokenizing a tweet.
def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain.  Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength.
    # The resulting list has an even length and no repeated indices.
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
    #  additional tokens from the last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr

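# Hedged, illustrative helper (not upstream, never called): mirrors the index
# bookkeeping inside simpleTokenize above.  Protected ("bad") spans like
# [(2, 5), (8, 10)] over a 12-character string become the boundary list
# [0, 2, 5, 8, 10, 12]; consecutive pairs of boundaries delimit the "good",
# freely splittable slices.
def _span_bookkeeping_example():
    badSpans = [(2, 5), (8, 10)]
    textLength = 12
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)
    assert indices == [0, 2, 5, 8, 10, 12]
    goodSlices = [(indices[i], indices[i+1]) for i in range(0, len(indices), 2)]
    assert goodSlices == [(0, 2), (5, 8), (10, 12)]
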
def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

# "foo   bar " => "foo bar"
def squeezeWhitespace(input):
    return Whitespace.sub(" ", input).strip()

# Final pass tokenization based on special patterns
def splitToken(token):
    m = Contractions.search(token)
    if m:
        return [m.group(1), m.group(2)]
    return [token]

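# Hedged, illustrative helper (not upstream, never called): splitToken splits a
# trailing contraction off a word using the Contractions pattern; it only comes
# into play if the commented-out block in simpleTokenize is re-enabled.
def _split_token_examples():
    assert splitToken("don't") == ["do", "n't"]
    assert splitToken("you're") == ["you", "'re"]
    assert splitToken("hello") == ["hello"]
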
# Assume 'text' has no HTML escaping.
def tokenize(text):
    return simpleTokenize(squeezeWhitespace(text))


# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
    tokens = tokenize(normalizeTextForTagger(text))
    return tokens
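
# Hedged, end-to-end illustration (not part of the upstream module): a rough
# sketch of what tokenizeRawTweetText is expected to produce for a simple
# ASCII tweet, with the at-mention, emoticon, URL and hashtag each protected
# as a single token.  Run this file directly (under Python 2) to check.
def _tokenize_example():
    tweet = "RT @remy: just an example :) http://example.com #test"
    expected = ["RT", "@remy", ":", "just", "an", "example",
                ":)", "http://example.com", "#test"]
    assert tokenizeRawTweetText(tweet) == expected

if __name__ == "__main__":
    _tokenize_example()
    print("example tweet tokenized as expected")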