gopher.r-36.net

       Simplify parsing and unescape text entries. - zs - Zeitungsschau RSS-to-email converter
       
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 9891ca73640aa4fa074c54e92913f847ba1e756b
 (DIR) parent 9e95a0f332a1bfabfba59c9bad6460e70731db9f
 (HTM) Author: Christoph Lohmann <20h@r-36.net>
       Date:   Wed, 11 Nov 2015 22:08:35 +0100
       
       Simplify parsing and unescape text entries.
       
       Diffstat:
         zeitungsschau/feed.py               |      62 +++++++++++++++----------------
       
       1 file changed, 29 insertions(+), 33 deletions(-)
       ---
 (DIR) diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py
       @@ -13,6 +13,7 @@ import requests
        import hashlib
        import pytz
        import codecs
       +import html
        
        def parseiso(dstr, now):
                try:
       @@ -32,18 +33,9 @@ def removenamespaces(xml):
                                        elem.tag = elem.tag[nsl:]
        
        def parsexml(astr):
       -        try:
       -                xml = objectify.fromstring(astr)
       -                removenamespaces(xml)
       -        except etree.XMLSyntaxError:
       -                try:
       -                        parser = etree.HTMLParser()
       -                        xml = objectify.fromstring(astr, parser)
       -                        removenamespaces(xml)
       -                except etree.XMLSyntaxError:
       -                        parser = etree.XMLParser(resolve_entities=False)
       -                        xml = objectify.fromstring(astr, parser)
       -                        removenamespaces(xml)
       +        xml = objectify.fromstring(astr)
       +        removenamespaces(xml)
       +        # Throw XML parsing errors so we can blame the feed authors.
                return xml
        
        def parse(astr):
       @@ -57,10 +49,6 @@ def parse(astr):
                isrdf = False
                now = datetime.now(pytz.utc)
        
       -        feede = xml.xpath(".//feed")
       -        if len(feede) > 0:
       -                xml = feede[0]
       -
                if hasattr(xml, "channel"):
                        if hasattr(xml, "item"):
                                isrdf = True
       @@ -71,11 +59,11 @@ def parse(astr):
                feed["title"] = ""
                for e in ("title", "description"):
                        if hasattr(xml, e):
       -                        feed[e] = str(xml[e])
       +                        feed[e] = html.unescape(str(xml[e]))
                
                if hasattr(xml, "image") and hasattr(xml.image, "title"):
                        if "title" not in feed:
       -                        feed["title"] = str(xml.image.title)
       +                        feed["title"] = html.unescape(str(xml.image.title))
        
                if hasattr(xml, "updated"):
                        feed["updated"] = parseiso(xml.updated, now) 
       @@ -93,25 +81,25 @@ def parse(astr):
                                feed["link"] = str(xml.link)
        
                if hasattr(xml, "webmaster"):
       -                feed["email"] = str(xml.webmaster)
       +                feed["email"] = html.unescape(str(xml.webmaster))
                elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
       -                feed["email"] = str(xml.owner.email)
       +                feed["email"] = html.unescape(str(xml.owner.email))
                elif hasattr(xml, "author") and hasattr(xml.author, "email"):
       -                feed["email"] = str(xml.author.email)
       +                feed["email"] = html.unescape(str(xml.author.email))
                elif hasattr(xml, "webMaster"):
       -                feed["email"] = str(xml.webMaster)
       +                feed["email"] = html.unescape(str(xml.webMaster))
                elif hasattr(xml, "managingeditor"):
       -                feed["email"] = str(xml.managingeditor)
       +                feed["email"] = html.unescape(str(xml.managingeditor))
                elif hasattr(xml, "managingEditor"):
       -                feed["email"] = str(xml.managingEditor)
       +                feed["email"] = html.unescape(str(xml.managingEditor))
        
                if hasattr(xml, "author"):
                        if hasattr(xml.author, "name"):
       -                        feed["author"] = str(xml.author.name)
       +                        feed["author"] = html.unescape(str(xml.author.name))
                        else:
       -                        feed["author"] = str(xml.author)
       +                        feed["author"] = html.unescape(str(xml.author))
                elif hasattr(xml, "creator"):
       -                feed["author"] = str(xml.creator)
       +                feed["author"] = html.unescape(str(xml.creator))
        
                entryname = "entry"
                if isrss == True or isrdf == True:
       @@ -123,7 +111,8 @@ def parse(astr):
                                article = {}
                                # title
                                if hasattr(entry, "title"):
       -                                article["title"] = str(entry["title"])
       +                                article["title"] = html.unescape(\
       +                                                str(entry["title"]))
        
                                # link
                                if hasattr(entry, "link"):
       @@ -149,8 +138,9 @@ def parse(astr):
                                                hasattr(entry.group, "content"):
                                        if "url" in entry.group.content:
                                                article["file"] = \
       +                                                html.unescape(\
                                                        str(entry.group.content.\
       -                                                attrib["file"])
       +                                                attrib["file"]))
        
                                # updated
                                try:
       @@ -171,19 +161,25 @@ def parse(astr):
                                # author
                                if hasattr(entry, "author"):
                                        if hasattr(entry.author, "name"):
       -                                        article["author"] = str(entry.author.name)
       +                                        article["author"] = html.unescape(\
       +                                                        str(entry.author.name))
                                        else:
       -                                        article["author"] = str(entry.author)
       +                                        article["author"] = html.unescape(\
       +                                                        str(entry.author))
                                elif hasattr(entry, "creator"):
       -                                article["author"] = str(entry.creator)
       +                                article["author"] = html.unescape(\
       +                                                str(entry.creator))
        
                                # tags
                                if hasattr(entry, "category"):
                                        article["tags"] = []
                                        for cat in entry["category"][:]:
       -                                        article["tags"].append(str(cat))
       +                                        article["tags"].append(\
       +                                                        html.unescape(\
       +                                                        str(cat)))
        
                                # text
       +                        # Don't unescape the text, it might contain HTML.
                                if hasattr(entry, "encoded"):
                                        article["text"] = str(entry.encoded)
                                elif hasattr(entry, "content"):