tcounterfacto - counterfacto - small software tool to analyze twitter and highlight counterfactual statements
 (HTM) git clone git://parazyd.org/counterfacto.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tcounterfacto (8746B)
       ---
            1 #!/usr/bin/env python2
            2 # Counterfacto is Copyright (c) 2016 by the Dyne.org Foundation
            3 # as part of the PIEnews project
            4 #
            5 # This file is part of Counterfacto
            6 # Written by Ivan J. <parazyd@dyne.org>
            7 #
            8 # This source code is free software; you can redistribute it and/or
            9 # modify it under the terms of the GNU Public License as published by
           10 # the Free Software Foundation; either version 3 of the License, or
           11 # (at your option) any later version.
           12 #
           13 # This source code is distributed in the hope that it will be useful,
           14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
           15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  Please refer
           16 # to the GNU Public License for more details.
           17 #
           18 # You should have received a copy of the GNU Public License along with
           19 # this source code; if not, write to: Free Software Foundation, Inc.,
           20 # 675 Mass Ave, Cambridge, MA 02139, USA.
           21 #
           22 # This project has received funding from the European Union's Horizon
           23 # 2020 Programme for research, technological development and
           24 # demonstration under grant agreement nr. 687922
           25 
           26 import nltk
           27 from nltk.tag.perceptron import PerceptronTagger
           28 import re
           29 import sys
           30 from twitter import *
           31 import twokenize
           32 
           33 global tweetfile
           34 global taggedFile
           35 taggedFile = 'tagged.txt'
           36 
           37 try:
           38     with open('credentials') as fd:
           39         exec(fd.read())
           40 except:
           41     print('no credentials file found. please create it.')
           42     exit(1)
           43 
           44 def main():
           45     ## credential check json
           46     #print(api.VerifyCredentials())
           47 
           48     try:
           49         if sys.argv[1] == '-f':
           50             tweetfile = sys.argv[2]
           51             classify(tweetfile)
           52 
           53         elif sys.argv[1] == '-a':
           54             api = Twitter(auth=OAuth(oatoken,oasecret,conskey,conssecret))
           55             accountname = sys.argv[2]
           56             statuses = api.statuses.user_timeline(screen_name=accountname,
           57                                                   count=100)
           58 
           59             tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
           60             tweetFile = open(tweetfile, 'w')
           61 
           62             for s in statuses:
           63                 sintweet = s['text']
           64                 sintweet = sintweet.replace('\n', ' ')
           65                 sintweet = sintweet.encode('ascii', 'ignore')
           66                 tweetFile.write(sintweet + '\n')
           67                 #print('wrote tweet')
           68 
           69             tweetFile.close()
           70             classify(tweetfile)
           71 
           72         elif sys.argv[1] == '-s':
           73             api = twitter.Api(consumer_key=twit_consumer_key,
           74                   consumer_secret=twit_consumer_secret,
           75                   access_token_key=twit_access_key,
           76                   access_token_secret=twit_access_secret)
           77 
           78             if len(sys.argv) >= 3:
           79                 searchterm = ' '.join(sys.argv[2:])
           80             else:
           81                 searchterm = sys.argv[2]
           82 
           83             statuses = api.GetSearch(term=searchterm,
           84                                      count=100)
           85                                      #result_type="recent")
           86 
           87             #for s in statuses:
           88             #    print(s.text)
           89             #exit()
           90 
           91             tweetfile = 'fetchedtweets-' + sys.argv[2] + '.txt'
           92             tweetFile = open(tweetfile, 'w')
           93 
           94             for s in statuses:
           95                 sintweet = s.text
           96                 sintweet = sintweet.replace('\n', ' ')
           97                 sintweet = sintweet.encode('ascii', 'ignore')
           98                 tweetFile.write(sintweet + '\n')
           99                 #print('wrote tweet')
          100 
          101             tweetFile.close()
          102             classify(tweetfile)
          103 
          104     except:
          105         print("usage: " + sys.argv[0] + " [-a account] [-f tweetfile] [-s searchterm]")
          106         exit(1)
          107 
          108 ## {{{ processing functions
          109 def tokenizelocal():
          110     tweets = tweetFile.read().splitlines()
          111     for t in tweets:
          112         print(t + '\n')
          113         print(str(twokenize.tokenize(t)) + '\n')
          114 
          115 def format_tweet(message):
          116     m = str(message)
          117     m = m.replace('\n', ' ')
          118     m = m.encode('ascii', 'ignore')
          119     return m
          120 
          121 def format_tagged(tagged_list):
          122     out = ''
          123     for t in tagged_list:
          124         token, tag = postprocess_tag(t[0], t[1])
          125         out = out + token + '/' + tag + '/'
          126     out = out + '\n'
          127     return out
          128 
          129 def postprocess_tag(token, tag):
          130     outtag = tag
          131     if (is_twitter_cf_modal(token)):
          132         outtag = 'MD'
          133     elif (tag_CCJ(token)):
          134         outtag = 'CCJ'
          135     return token, outtag
          136 
          137 def get_cf_form(tagged_message):
          138 
          139     # Filter out questions
          140     pq = re.compile('\.*/\?/.', re.IGNORECASE)
          141     if pq.search(tagged_message) != None:
          142         return 0
          143 
          144     # CASE 1 WISH VERB FORM
          145     p1 = re.compile('\.*(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)
          146     if p1.search(tagged_message) != None:
          147         return 1
          148 
          149 
          150     # CASE 2 CONJUNTION NORMAL
          151     p2 = re.compile('\.*/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)
          152     if p2.search(tagged_message) != None:
          153         return 2
          154 
          155 
          156     # CASE 3 CONJUNCTIVE CONVERSE
          157     p3 = re.compile('\.*/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)
          158     if p3.search(tagged_message) != None:
          159         return 3
          160 
          161 
          162     # CASE 5 Should have
          163     p4 = re.compile('\.*/((should\'ve)/MD/)|(((should)|(shoulda)(shulda)|(shuda)|(shudda)|(shudve))/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))', re.IGNORECASE)
          164     if p4.search(tagged_message) != None:
          165         return 4
          166 
          167     # CASE 6 VERB INVERSION
          168     p5 = re.compile(("\.*(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)).*/MD/)"
          169                     "|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*/MD/)"
          170                     "|(/MD/.*/VB.*/had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))"), re.IGNORECASE)
          171     if p5.search(tagged_message) != None:
          172         return 5
          173 
          174 
          175     # CASE 6 MODAL NORMAL
          176     p6 = re.compile('\.*/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))', re.IGNORECASE)
          177     if p6.search(tagged_message) != None:
          178         return 6
          179 
          180     # If no matches
          181     return 0
          182 
          183 
          184 
          185 def is_twitter_cf_modal(word):
          186     w = unicode(word, errors='ignore').encode('utf-8').lower()
          187     if (w == 'should' or
          188         w == 'should\'ve' or
          189         w == 'shouldve' or
          190         w == 'shoulda' or
          191         w == 'shulda' or
          192         w == 'shuda' or
          193         w == 'shudda' or
          194         w == 'shudve' or
          195         w == 'would' or
          196         w == 'would\'ve' or
          197         w == 'wouldve' or
          198         w == 'woulda' or
          199         w == 'wuda' or
          200         w == 'wulda' or
          201         w == 'wudda' or
          202         w == 'wudve' or
          203         w == 'wlda' or
          204         w == 'could' or
          205         w == 'could\'ve' or
          206         w == 'couldve' or
          207         w == 'coulda' or
          208         w == 'cudda' or
          209         w == 'culda' or
          210         w == 'cudve' or
          211         w == 'must' or
          212         w == 'mustve' or
          213         w == 'might' or
          214         w == 'might\'ve' or
          215         w == 'mightve' or
          216         w == 'ought' or
          217         w == 'may' or
          218         w == 'i\'d' or
          219         w == 'id' or
          220         w == 'we\'d' or
          221         w == 'youd' or
          222         w == 'you\'d' or
          223         w == 'he\'d' or
          224         w == 'she\'d'):
          225             return True
          226     return False
          227 
          228 def tag_CCJ(word):
          229     w = word.lower()
          230     '''
          231     as long as, even if, if, one condition that, provided (that),
          232     providing (that), so long as, unless, whether... or, supposing,
          233     suppose, imagine, but for
          234     '''
          235     if(w == 'as' or
          236         w == 'if' or
          237         w == 'even' or
          238         w == 'provided' or
          239         w == 'providing' or
          240         w == 'suppose' or
          241         w == 'supposing' or
          242         w == 'unless' or
          243         w == 'whether' or
          244         w == 'envision' or
          245         w == 'envisioning' or
          246         w == 'conceptualize'or
          247         w == 'conceptualizing' or
          248         w == 'conjure' or
          249         w == 'conjuring' or
          250         w == 'visualize' or
          251         w == 'visualizing'):
          252         return True
          253     return False
          254 
          255 def get_tagged_message(message, tagger):
          256     tagset = None
          257     formatted_message = format_tweet(message)
          258     tokens = twokenize.tokenize(formatted_message)
          259     tags = nltk.tag._pos_tag(tokens, tagset, tagger)
          260     return format_tagged(tags)
          261 ## }}}
          262 
          263 def classify(tweetfile):
          264     tweetFile = open(tweetfile, 'r')
          265     tagFile = open(taggedFile, 'w')
          266 
          267     tagger = PerceptronTagger()
          268     form_num = 7
          269 
          270     cf_count = [[0 for x in range(form_num)] for x in range(form_num)]
          271 
          272     form_vec = []
          273 
          274     print("Reading file...")
          275     tweet = tweetFile.readline()
          276 
          277     while tweet != '':
          278         taggedTweet = get_tagged_message(tweet, tagger)
          279         tagFile.write(taggedTweet)
          280         #print("did tweet")
          281         form = int(get_cf_form(taggedTweet))
          282 
          283         ## if our tweet is positive, print it
          284         if form != 0:
          285             print(tweet)
          286 
          287         form_vec.append(form)
          288 
          289         cf_count[form][0] = cf_count[form][0] + 1
          290 
          291         tweet = tweetFile.readline()
          292 
          293     count = 0
          294     for i in xrange(1,form_num):
          295         count = count + cf_count[i][0]
          296 
          297 
          298     print("finished tagging...")
          299     tweetFile.close()
          300     tagFile.close()
          301 
          302     print("counterfactuals: " + str(count) + "/100")
          303 
          304 main()