snowstorm-matrix/scrape.py

from os import environ

import requests
import re
from bs4 import BeautifulSoup


def get_accesstoken_from_file(accesstoken_path):
    """Read a single access token from a file.

    Args:
        accesstoken_path: Path of the file holding the token.

    Returns:
        The file's content with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(accesstoken_path, 'r') as accesstoken_file:
        return accesstoken_file.read().strip()
def get_blog():
    """Scrape the Blizzard news front page and return its posts.

    Featured articles come first in the result, followed by the entries of
    the regular article list.

    Returns:
        A list of dicts, one per post, with the keys ``image``, ``game``,
        ``title``, ``description`` and ``url``. ``description`` is always
        the empty string for featured articles.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
        IndexError: If an image element's ``style`` attribute contains no
            ``url(...)`` value, or an expected text element is missing.
        AttributeError: If an expected child element is not found in the
            markup (BeautifulSoup's ``find`` returned ``None``).
    """
    url = 'https://news.blizzard.com/en-us/'
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole script.
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, 'html.parser')

    base_url = 'https://news.blizzard.com'

    blog = []

    # Featured articles: the image URL sits quoted inside the inline style.
    feature_list_html = soup.find_all(class_='FeaturedArticle-link')
    for feature_html in feature_list_html:
        image_html = feature_html.find(class_='Card-image')
        # Raw string: "\(" is not a valid escape in a normal string literal.
        image_url_fragment = re.findall(r'url\("(.*?)"\)', image_html.attrs['style'])[0]
        # The fragment is scheme-relative, so prepend the scheme.
        image_url = 'https:' + image_url_fragment

        # First truncated text is used as the game, second as the title.
        text_list = feature_html.find_all(class_='text-truncate-ellipsis')

        blog.append({
            'image': image_url,
            'game': text_list[0].contents[0],
            'title': text_list[1].contents[0],
            'description': '',
            'url': base_url + feature_html.attrs['href'],
        })

    # Regular articles: here the image URL is matched without quotes.
    article_list_html = soup.find_all(class_='ArticleListItem')
    for article_html in article_list_html:
        image_html = article_html.find(class_='ArticleListItem-image')
        image_url_fragment = re.findall(r'url\((.*?)\)', image_html.attrs['style'])[0]
        image_url = 'https:' + image_url_fragment

        content_html = article_html.find(class_='ArticleListItem-contentGrid')

        blog.append({
            'image': image_url,
            'game': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0],
            'title': content_html.find(class_='ArticleListItem-title').contents[0],
            'description': content_html.find(class_='ArticleListItem-description').find(class_='h6').contents[0],
            'url': base_url + article_html.find(class_='ArticleLink').attrs['href'],
        })

    return blog
def get_body(post):
    """Return the plain-text form of a post.

    The result is the title, then the description if it is not the empty
    string, then the URL, each on its own line (no trailing newline).
    """
    lines = [post['title']]

    description = post['description']
    if description != '':
        lines.append(description)

    lines.append(post['url'])

    return "\n".join(lines)
def get_formatted_body(post):
    """Return the HTML form of a post.

    The markup is a link to the post wrapping an ``<h5>`` title, an optional
    ``<p>`` with the description (omitted when it is the empty string), and
    finally the URL as plain text.

    The title, description and URL are HTML-escaped before being inserted,
    so characters such as ``&``, ``<`` and ``"`` in scraped text cannot
    break or inject markup.

    Args:
        post: Mapping with the string values ``title``, ``description``
            and ``url``.

    Returns:
        The HTML string.
    """
    # Function-scope import so this block does not depend on the file header.
    from html import escape

    # quote=True (the default) also escapes quotes, which is required for
    # the value placed inside href="...".
    url = escape(post['url'])

    formatted_body = '<a href="' + url + '">'
    formatted_body += '<h5>' + escape(post['title'], quote=False) + '</h5>'
    formatted_body += '</a>'

    if post['description'] != '':
        formatted_body += '<p>' + escape(post['description'], quote=False) + '</p>'

    formatted_body += url

    return formatted_body


# Required settings; a missing environment variable raises KeyError.
homeserver = environ['HOMESERVER_URL']
mxid = environ['MXID_PREFIX']
admin_room = environ['ADMIN_ROOM']

# For every ACCESSTOKEN_<NAME>_FILE variable, store the token read from the
# file it points at under the lower-cased <NAME>.
accesstoken = {}
for key, token_path in environ.items():
    game = re.match('^ACCESSTOKEN_([A-Z]*)_FILE$', key)
    if game is None:
        continue
    accesstoken[game.group(1).lower()] = get_accesstoken_from_file(token_path)


# Fetch the posts and print each one as plain text, then as HTML.
blog = get_blog()
for post in blog:
    plain_text = get_body(post)
    print(plain_text)
    rich_text = get_formatted_body(post)
    print(rich_text)