tfactolib.py - counterfacto - small software tool to analyze twitter and highlight counterfactual statements
 (HTM) git clone git://parazyd.org/counterfacto.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tfactolib.py (5455B)
       ---
            1 #!/usr/bin/env python2
            2 # Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
            3 
            4 import nltk
            5 import re
            6 import twokenize
            7 from nltk.tag.perceptron import PerceptronTagger
            8 
            9 def tokenizelocal():
           10     tweets = tweetFile.read().splitlines()
           11     for t in tweets:
           12         print(t + '\n')
           13         print(str(twokenize.tokenize(t)) + '\n')
           14 
           15 def format_tweet(message):
           16     m = str(message)
           17     m = m.replace('\n', ' ')
           18     m = m.encode('ascii', 'ignore')
           19     return m
           20 
           21 def format_tagged(tagged_list):
           22     out = ''
           23     for t in tagged_list:
           24         token, tag = postprocess_tag(t[0], t[1])
           25         out = out + token + '/' + tag + '/'
           26     out = out + '\n'
           27     return out
           28 
           29 def postprocess_tag(token, tag):
           30     outtag = tag
           31     if (is_twitter_cf_modal(token)):
           32         outtag = 'MD'
           33     elif (tag_CCJ(token)):
           34         outtag = 'CCJ'
           35     return token, outtag
           36 
           37 def get_cf_form(tagged_message):
           38 
           39     # Filter out questions
           40     pq = re.compile('\.*/\?/.', re.IGNORECASE)
           41     if pq.search(tagged_message) != None:
           42         return 0
           43 
           44     # CASE 1 WISH VERB FORM
           45     p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
           46     if p1.search(tagged_message) != None:
           47         return 1
           48 
           49 
           50     # CASE 2 CONJUNTION NORMAL
           51     p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
           52     if p2.search(tagged_message) != None:
           53         return 2
           54 
           55 
           56     # CASE 3 CONJUNCTIVE CONVERSE
           57     p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
           58     if p3.search(tagged_message) != None:
           59         return 3
           60 
           61 
           62     # CASE 5 Should have
           63     p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
           64     if p4.search(tagged_message) != None:
           65         return 4
           66 
           67     # CASE 6 VERB INVERSION
           68     p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
           69                     "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
           70                     "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
           71     if p5.search(tagged_message) != None:
           72         return 5
           73 
           74 
           75     # CASE 6 MODAL NORMAL
           76     p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
           77     if p6.search(tagged_message) != None:
           78         return 6
           79 
           80     # If no matches
           81     return 0
           82 
           83 
           84 def is_twitter_cf_modal(word):
           85     w = unicode(word, errors='ignore').encode('utf-8').lower()
           86     if (w == 'should' or
           87         w == 'should\'ve' or
           88         w == 'shouldve' or
           89         w == 'shoulda' or
           90         w == 'shulda' or
           91         w == 'shuda' or
           92         w == 'shudda' or
           93         w == 'shudve' or
           94         w == 'would' or
           95         w == 'would\'ve' or
           96         w == 'wouldve' or
           97         w == 'woulda' or
           98         w == 'wuda' or
           99         w == 'wulda' or
          100         w == 'wudda' or
          101         w == 'wudve' or
          102         w == 'wlda' or
          103         w == 'could' or
          104         w == 'could\'ve' or
          105         w == 'couldve' or
          106         w == 'coulda' or
          107         w == 'cudda' or
          108         w == 'culda' or
          109         w == 'cudve' or
          110         w == 'must' or
          111         w == 'mustve' or
          112         w == 'might' or
          113         w == 'might\'ve' or
          114         w == 'mightve' or
          115         w == 'ought' or
          116         w == 'may' or
          117         w == 'i\'d' or
          118         w == 'id' or
          119         w == 'we\'d' or
          120         w == 'youd' or
          121         w == 'you\'d' or
          122         w == 'he\'d' or
          123         w == 'she\'d'):
          124             return True
          125     return False
          126 
          127 def tag_CCJ(word):
          128     w = word.lower()
          129     '''
          130     as long as, even if, if, one condition that, provided (that),
          131     providing (that), so long as, unless, whether... or, supposing,
          132     suppose, imagine, but for
          133     '''
          134     if(w == 'as' or
          135         w == 'if' or
          136         w == 'even' or
          137         w == 'provided' or
          138         w == 'providing' or
          139         w == 'suppose' or
          140         w == 'supposing' or
          141         w == 'unless' or
          142         w == 'whether' or
          143         w == 'envision' or
          144         w == 'envisioning' or
          145         w == 'conceptualize'or
          146         w == 'conceptualizing' or
          147         w == 'conjure' or
          148         w == 'conjuring' or
          149         w == 'visualize' or
          150         w == 'visualizing'):
          151         return True
          152     return False
          153 
          154 def get_tagged_message(message, tagger):
          155     tagset = None
          156     formatted_message = format_tweet(message)
          157     tokens = twokenize.tokenize(formatted_message)
          158     tags = nltk.tag._pos_tag(tokens, tagset, tagger)
          159     return format_tagged(tags)
          160 
          161 def classify(tweetfile, taggedfile):
          162     tweetfile  = open(tweetfile,  "r")
          163     taggedfile = open(taggedfile, "w")
          164     counterfactuals = open('counterfactuals.txt', 'w')
          165 
          166     tagger = PerceptronTagger()
          167     form_num = 8
          168 
          169     cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
          170 
          171     form_vec = []
          172 
          173     print("Reading file...")
          174     tweet = tweetfile.readline()
          175 
          176     while tweet:
          177         taggedTweet = get_tagged_message(tweet, tagger)
          178         taggedfile.write(taggedTweet)
          179         form = int(get_cf_form(taggedTweet))
          180 
          181         if form:
          182             print(tweet)
          183             counterfactuals.write(tweet + '<hr>\n')
          184 
          185         form_vec.append(form)
          186         cf_count[form][0] += 1
          187         tweet = tweetfile.readline()
          188 
          189     count = 0
          190     for i in xrange(1, form_num):
          191         count += cf_count[i][0]
          192 
          193     print("Finished tagging...")
          194     tweetfile.close()
          195     taggedfile.close()
          196 
          197     print("counterfactuals: " + str(count) + "/100")
          198     counterfactuals.write("counterfactuals: " + str(count) + "/100<br>\n")
          199     counterfactuals.close()