bitreich.org

       Add example selenium script for the atom hackathon. - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
 (HTM) git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
       ---
 (DIR) commit a7cd0c547c792f74b7784cc0a8c806380a28ca2f
 (DIR) parent 2922c09dc4919dcea4ac331bbaa4e373ba4ccc4a
 (HTM) Author: Christoph Lohmann <20h@r-36.net>
       Date:   Thu, 10 Aug 2023 16:10:01 +0200
       
       Add example selenium script for the atom hackathon.
       
       Diffstat:
         A sfeed-atom/kvssachsen2atom          |     121 +++++++++++++++++++++++++++++++
       
       1 file changed, 121 insertions(+), 0 deletions(-)
       ---
 (DIR) diff --git a/sfeed-atom/kvssachsen2atom b/sfeed-atom/kvssachsen2atom
       @@ -0,0 +1,121 @@
       +#!/usr/bin/env python
       +# coding=utf-8
       +#
       +# Copy me if you can.
       +# by 20h
       +#
       +
       +import os
       +import sys
       +import getopt
       +
       +from selenium import webdriver
       +from selenium.webdriver.chrome.options import Options as chromeoptions
       +from selenium.webdriver.support.ui import WebDriverWait
       +from selenium.webdriver.support import expected_conditions as EC
       +from selenium.webdriver.common.by import By
       +
       +from datetime import datetime
       +import pytz
       +
       +def usage(app):
       +        app = os.path.basename(app)
       +        sys.stderr.write("usage: %s [-h] URI\n" % (app))
       +        sys.exit(1)
       +
       +def main(args):
       +        try:
       +                opts, largs = getopt.getopt(args[1:], "h")
       +        except getopt.GetoptError as err:
       +                print(str(err))
       +                usage(args[0])
       +        
       +        for o, a in opts:
       +                if o == "-h":
       +                        usage(args[0])
       +                else:
       +                        assert False, "unhandled option"
       +
       +        if len(largs) < 1:
       +                usage(args[0])
       +
       +        link = largs[0]
       +
       +        options = chromeoptions()
       +        chromearguments = [
       +                "headless",
       +                "no-sandbox",
       +                "disable-extensions",
       +                "disable-dev-shm-usage",
       +                "start-maximized",
       +                "window-size=1900,1080",
       +                "disable-gpu"
       +        ]
       +        for carg in chromearguments:
       +                options.add_argument(carg)
       +
       +        driver = webdriver.Chrome(options=options)
       +        driver.get(link)
       +
       +        isnews = WebDriverWait(driver=driver, timeout=60).until(
       +                        EC.presence_of_element_located((By.XPATH,
       +                                "//div[@data-last-letter]")
       +                        )
       +        )
       +        newslist = driver.find_elements(By.XPATH, "//div[@data-filter-target=\"list\"]")[0]
       +
       +        title = driver.find_elements(By.XPATH, "//meta[@property=\"og:title\"]")[0].get_attribute("content")
       +        description = title
       +        globaltags = ""
       +
       +        print("""<?xml version="1.0" encoding="utf-8"?>""")
       +        print("""<feed xmlns="http://www.w3.org/2005/Atom">""")
       +        print("\t<title><![CDATA[%s]]></title>" % (title))
       +        print("\t<subtitle><![CDATA[%s]]></subtitle>" % (description))
       +        print("\t<id>%s</id>" % (link))
       +        print("\t<link href=\"%s\" rel=\"self\" />" % (link))
       +        print("\t<link href=\"%s\" />" % (link))
       +
       +        utcnow = datetime.now(pytz.utc)
       +        print("\t<updated>%s</updated>" % (utcnow.isoformat()))
       +
       +        articles = newslist.find_elements(By.XPATH, "./div")
       +        baselink = "/".join(link.split("/", 3)[:-1])
       +        for article in articles[::-1]:
       +                link = article.find_elements(By.XPATH, "./a")[0]
       +                plink = link.get_attribute("href")
       +                if not plink.startswith("http"):
       +                        plink = "%s/%s" % (baselink, plink)
       +                ptitle = link.get_attribute("data-title")
       +                pcontent = article.text
       +                pauthor = "sachsen@kvsachsen.de"
       +
       +                # Normalize datetime.
       +                updateds = article.find_elements(By.XPATH, ".//time")[0].text
       +                try:
       +                        dtupdated = datetime.strptime(updateds, "%d.%m.%Y")
       +                except ValueError:
       +                        continue
       +
       +                dtupdated = dtupdated.replace(hour=12, minute=0,\
       +                                second=0, tzinfo=pytz.utc)
       +                if dtupdated.year > utcnow.year:
       +                        dtupdated = dtupdated.replace(year=utcnow.year)
       +                pupdated = dtupdated
       +
       +                print("\t<entry>")
       +                print("\t\t<id>%s</id>" % (plink))
       +                print("\t\t<title><![CDATA[%s]]></title>" % (ptitle))
       +                print("\t\t<link href=\"%s\" />" % (plink))
       +                print("\t\t<author><name>%s</name></author>" % (pauthor))
       +                print("\t\t<updated>%s</updated>" % (pupdated.isoformat()))
       +                print("\t\t<content><![CDATA[%s]]></content>" % (pcontent))
       +                print("\t</entry>")
       +        
       +        print("</feed>")
       +
       +        return 0
       +
       +if __name__ == "__main__":
       +        sys.exit(main(sys.argv))
       +