from os import environ
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from random import randrange

import asyncio
from nio import ClientConfig, AsyncClient, LoginResponse, InviteEvent
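

# read the bot's Matrix access token from a file on disk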
def get_accesstoken_from_file(accesstoken_path):
    accesstoken_file = open(accesstoken_path, 'r')
    single_accesstoken = accesstoken_file.read().strip()
    accesstoken_file.close()

    return single_accesstoken
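

# extract the article image url from the card's inline background-image style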
def extract_image_url(image_html):
    # only recent articles wrap the url in quotes, so we have to match it
    # both with and without quotes
    image_url_fragment = re.findall(r'url\("?(.*?)"?\)', image_html.attrs['style'])[0]
    # prepend the scheme to build a full url
    return 'https:'+image_url_fragment
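

# normalize a category label for comparison, e.g. 'World of Warcraft' -> 'worldofwarcraft'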
def sanitize_category(raw_category):
    return raw_category.replace(' ', '').replace(':', '').lower()
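

# scrape the Blizzard news overview page and return a list of post dicts
# (image, category, title, description, url), oldest post first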
def get_blog():
    url = 'https://news.blizzard.com/en-us/'
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    base_url = 'https://news.blizzard.com'

    blog = []

    for featured_article in soup.select('#featured-articles article'):
        image_url = extract_image_url(featured_article.find(class_='Card-image'))

        text_list = featured_article.select('.text-truncate-ellipsis')
        category = sanitize_category(text_list[0].text)
        title = text_list[1].text

        url = base_url+featured_article.find('a').attrs['href']

        blog.append({
            'image': image_url,
            'category': category,
            'title': title,
            'description': '',  # featured articles don't have a description
            'url': url,
        })

    for recent_article in soup.select('#recent-articles article'):
        image_url = extract_image_url(recent_article.find(class_='ArticleListItem-image'))

        category = sanitize_category(recent_article.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').text)
        title = recent_article.find(class_='ArticleListItem-title').text
        description = recent_article.find(class_='ArticleListItem-description').find(class_='h6').text

        url = base_url+recent_article.find('a').attrs['href']

        blog.append({
            'image': image_url,
            'category': category,
            'title': title,
            'description': description,
            'url': url
        })

    # reverse order so the oldest article is at [0]
    # we want to iterate later from oldest to newest
    blog.reverse()

    return blog
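

# build the plain-text body of the Matrix notice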
def get_body(post):
    body = post['title']+"\n"

    if post['description']:
        body += post['description']+"\n"

    body += post['url']

    return body
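

# build the HTML-formatted body of the Matrix notice: linked title plus optional description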
def get_formatted_body(post):
    formatted_body = '<h5><a href="'+post['url']+'">'
    formatted_body += post['title']
    formatted_body += '</a></h5>'

    if post['description']:
        formatted_body += '<p>'+post['description']+'</p>'

    return formatted_body
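

# main loop: restore the session, accept pending invites, then periodically
# scrape the blog and announce new posts of the configured category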
async def main():
    # initialize new client
    config = ClientConfig(store_sync_tokens=True)
    matrix = AsyncClient(homeserver,
                         config=config)

    # login by replaying a LoginResponse built from the saved access token;
    # 'xxx' is only a placeholder device_id
    login_response = LoginResponse(mxid,
                                   'xxx',
                                   accesstoken)
    await matrix.receive_response(login_response)

    # filter out everything except m.room.member (for invites)
    sync_filter = {
        'room': {
            'state': {
                'types': ['m.room.member'],
                'lazy_load_members': True
            },
            'timeline': {
                'types': ['invalid']
            },
            'ephemeral': {
                'types': ['invalid']
            }
        }
    }

    # setting this to enforce a scrape at first loop
    next_update = datetime.now()

    # use this event type to store our url cache
    cache_event_type = 'de.lubiland.snowstorm-matrix.cache'

    while True:
        # do sync first to e.g. accept an admin room invite
        sync = await matrix.sync(timeout=30000, sync_filter=sync_filter)
        print('last sync: '+str(datetime.now()))

        # accept any pending room invites
        for room_id in sync.rooms.invite:
            print('joining: '+room_id)
            await matrix.join(room_id)

        if next_update < datetime.now():
            # refresh url cache
            cache_state = await matrix.room_get_state_event(room_id=admin_room,
                                                            event_type=cache_event_type,
                                                            state_key=category)
            if hasattr(cache_state, 'content') and 'url_list' in cache_state.content:
                cache = cache_state.content['url_list']
            else:
                print('cache is empty')
                cache = []

            # scrape all blog posts and process them
            blog = get_blog()
            for post in blog:
                # check if post url is in cache and matches our category
                if post['url'] not in cache and post['category'] == category:
                    # post url not found in cache
                    # announce new post to matrix rooms
                    print('new post: '+post['url'])

                    content = {
                        'msgtype': 'm.notice',
                        'body': get_body(post),
                        'format': 'org.matrix.custom.html',
                        'formatted_body': get_formatted_body(post)
                    }
                    for room_id in matrix.rooms:
                        # don't send updates to the admin room
                        if room_id != admin_room:
                            print('to room: '+room_id)
                            await matrix.room_send(room_id=room_id,
                                                   message_type='m.room.message',
                                                   content=content)

                    # add url to cache
                    cache += [post['url']]

                else:
                    # post already in the cache or not from our category
                    pass

            # trim the cache to the 100 most recent urls
            while len(cache) > 100:
                cache.remove(cache[0])

            # set new cache event
            await matrix.room_put_state(room_id=admin_room,
                                        event_type=cache_event_type,
                                        state_key=category,
                                        content={'url_list': cache})

            # wait between 15min and 30min to randomize scraping
            next_update = datetime.now() + timedelta(minutes=randrange(15, 30))
            print('next scrape: '+str(next_update))
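

# read configuration from the environment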
homeserver = environ['HOMESERVER']
print('homeserver: '+homeserver)

mxid = environ['MXID']
print('mxid: '+mxid)

accesstoken = get_accesstoken_from_file(environ['ACCESSTOKEN_FILE'])
print('accesstoken_file: '+environ['ACCESSTOKEN_FILE'])

admin_room = environ['ADMIN_ROOM']
print('admin_room: '+admin_room)

category = environ['CATEGORY']
print('category: '+category)


asyncio.run(main())