From 586dcf15de960abeb72eac917ee3654619a53517 Mon Sep 17 00:00:00 2001 From: lub Date: Wed, 6 May 2020 16:58:30 +0200 Subject: [PATCH] refactor main() into single category bot --- README.md | 25 ++++---- scrape.py | 166 +++++++++++++++++++++++------------------------------- 2 files changed, 81 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 1bf5c67..0f4d093 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ Matrix Room: [#snowstorm-matrix:imninja.net](https://matrix.to/#/#snowstorm-matrix:imninja.net) This bot will send news about Blizzard games to Matrix room they got invited to. -It will additionally alert an admin room about missing bots. Information about which URLs already got posted is also saved into the admin room via state events. @@ -31,10 +30,11 @@ After initial configuration you can run it manually: ```bash docker run --rm \ -v $(pwd)/heroesofthestorm:/heroesofthestorm:ro \ - -v $(pwd)/worldofwarcraft:/worldofwarcraft:ro \ - -e ADMIN_ROOM='!iesofojasief90429ewiofj:matrix.org' \ - -e ACCESSTOKEN_HEROESOFTHESTORM_FILE=/heroesofthestorm \ - -e ACCESSTOKEN_WORLDOFWARCRAFT_FILE=/worldofwarcraft \ + -e HOMESERVER=https://example.org \ + -e MXID=@heeeroooooooes:example.org \ + -e ACCESSTOKEN_FILE=/heroesofthestorm \ + -e ADMIN_ROOM='!iesofojasief90429ewiofj:example.org' \ + -e CATEGORY=heroesofthestorm \ snowstorm-matrix ``` @@ -45,16 +45,11 @@ Or via docker-compose/swarm: deploy: replicas: 1 secrets: - - snowstorm-matrix_heroesofthestorm - - snowstorm-matrix_insideblizzard - - snowstorm-matrix_overwatch - snowstorm-matrix_worldofwarcraft environment: - - HOMESERVER_URL=http://synapse:8008 - - HOMESERVER_NAME=matrix.org - - ADMIN_ROOM=!jjpPluoxZoAOBQeYer:imninja.net - - ACCESSTOKEN_HEROESOFTHESTORM_FILE=/run/secrets/snowstorm-matrix_heroesofthestorm - - ACCESSTOKEN_INSIDEBLIZZARD_FILE=/run/secrets/snowstorm-matrix_insideblizzard - - ACCESSTOKEN_OVERWATCH_FILE=/run/secrets/snowstorm-matrix_overwatch - - 
ACCESSTOKEN_WORLDOFWARCRAFT_FILE=/run/secrets/snowstorm-matrix_worldofwarcraft + - HOMESERVER=http://synapse:8008 + - ACCESSTOKEN_FILE=/run/secrets/snowstorm-matrix_worldofwarcraft + - MXID=@forthehorde:example.com + - ADMIN_ROOM=!jjpPluoxZoAOBQeYer:example.org + - CATEGORY=worldofwarcraft ``` diff --git a/scrape.py b/scrape.py index 0055df3..5f36073 100644 --- a/scrape.py +++ b/scrape.py @@ -15,12 +15,6 @@ def get_accesstoken_from_file(accesstoken_path): accesstoken_file.close() return single_accesstoken -async def on_event(room, event): - if hasattr(event, 'membership'): - if event.membership == 'invite': - # automatically join invites - print('joining '+room.room_id) - join = await matrix[event.source['state_key']].join(room.room_id) def get_blog(): url = 'https://news.blizzard.com/en-us/' html = requests.get(url).text @@ -40,7 +34,7 @@ def get_blog(): blog.append({ 'image': image_url, - 'game': text_list[0].contents[0].replace(' ', '').replace(':', '').lower(), + 'category': text_list[0].contents[0].replace(' ', '').replace(':', '').lower(), 'title': text_list[1].contents[0], 'description': '', 'url': base_url+feature_html.attrs['href'], @@ -56,7 +50,7 @@ def get_blog(): blog.append({ 'image': image_url, - 'game': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0].replace(' ', '').replace(':', '').lower(), + 'category': content_html.find(class_='ArticleListItem-subtitle').find(class_='ArticleListItem-labelInner').contents[0].replace(' ', '').replace(':', '').lower(), 'title': content_html.find(class_='ArticleListItem-title').contents[0], 'description': content_html.find(class_='ArticleListItem-description').find(class_='h6').contents[0], 'url': base_url+article_html.find(class_='ArticleLink').attrs['href'], @@ -86,50 +80,53 @@ def get_formatted_body(post): return formatted_body async def main(): - event_type_prefix = 'de.lubiland.snowstorm-matrix.' 
- next_batch = {} - for game in accesstoken: - # initialize new client - mxid = '@'+mxid_prefix+game+':'+homeserver_name - config = ClientConfig(store_sync_tokens=True) - matrix[mxid] = AsyncClient(homeserver_url, - config=config) + # initialize new client + config = ClientConfig(store_sync_tokens=True) + matrix = AsyncClient(homeserver, + config=config) - # login - login_response = LoginResponse(mxid, - 'xxx', - accesstoken[game]) - await matrix[mxid].receive_response(login_response) + # login + login_response = LoginResponse(mxid, + 'xxx', + accesstoken) + await matrix.receive_response(login_response) - matrix[mxid].add_event_callback(on_event, InviteEvent) - # do a first sync - sync_filter = { - 'room': { - 'state': { - 'types': ['m.room.member'], - 'lazy_load_members': True - }, - 'timeline': { - 'types': ['invalid'] - }, - 'ephemeral': { - 'types': ['invalid'] - } + # filter out everything except m.room.member (for invites) + sync_filter = { + 'room': { + 'state': { + 'types': ['m.room.member'], + 'lazy_load_members': True + }, + 'timeline': { + 'types': ['invalid'] + }, + 'ephemeral': { + 'types': ['invalid'] } } - sync = await matrix[mxid].sync(timeout=3000, - sync_filter=sync_filter) - next_batch[mxid] = sync.next_batch + } + # setting this to enforce a scrape at first loop next_update = datetime.now() + + # use this event type to store our url cache + cache_event_type = 'de.lubiland.snowstorm-matrix.cache' + while True: + # do sync first to e.g. 
accept an admin room invite + sync = await matrix.sync(sync_filter=sync_filter) + for room_id in sync.rooms.invite: + print('joining '+room_id) + await matrix.join(room_id) + if next_update < datetime.now(): # refresh url cache - cache_state = await matrix[next(iter(matrix))].room_get_state_event(room_id=admin_room, - event_type=event_type_prefix+'cache', - state_key='') + cache_state = await matrix.room_get_state_event(room_id=admin_room, + event_type=cache_event_type+'cache', + state_key=category) if hasattr(cache_state, 'content') and 'url_list' in cache_state.content: cache = cache_state.content['url_list'] else: @@ -138,71 +135,50 @@ async def main(): # scape all blog posts and process them blog = get_blog() for post in blog: - if post['url'] not in cache: + # check if post url is in cache and matches our category + if post['url'] not in cache and post['category'] == category: # post url not found in cache - mxid = '@'+mxid_prefix+post['game']+':'+homeserver_name - if mxid in matrix: - # announce new post to matrix rooms - content = { - 'msgtype': 'm.notice', - 'body': get_body(post), - 'format': 'org.matrix.custom.html', - 'formatted_body': get_formatted_body(post) - } - for room_id in matrix[mxid].rooms: - if room_id != admin_room: - # don't send updates to the admin room - await matrix[mxid].room_send(room_id=room_id, - message_type='m.room.message', - content=content) - else: - # no accesstoken for the calculated mxid - content = { - 'msgtype': 'm.notice', - 'body': 'No accesstoken for '+mxid+' available.', - 'format': 'org.matrix.custom.html', - 'formatted_body': ('No accesstoken for '+ - ''+mxid+' available.') - } - # send the message with the first available matrix client, - # because we will always have at least one accesstoken - await matrix[next(iter(matrix))].room_send(room_id=admin_room, - message_type='m.room.message', - content=content) + # announce new post to matrix rooms + content = { + 'msgtype': 'm.notice', + 'body': get_body(post), + 
'format': 'org.matrix.custom.html', + 'formatted_body': get_formatted_body(post) + } + for room_id in matrix.rooms: + # don't send updates to the admin room + if room_id != admin_room: + await matrix.room_send(room_id=room_id, + message_type='m.room.message', + content=content) # add url to cache cache += [post['url']] - # check for double the post count, to have some buffer for manually purging URLs - # otherwise the cache could reshuffle when you remove too many URLs at once - while len(cache) > len(blog)*2: - cache.remove(cache[0]) - set_state = await matrix[next(iter(matrix))].room_put_state(room_id=admin_room, - event_type=event_type_prefix+'cache', - content={'url_list': cache}) + else: # no new posts found pass + + # trim the cache + # len(blog) is usually bigger than the count of posts in our category, + # so with len(blog) instead of the latter we have some buffer + while len(cache) > len(blog): + cache.remove(cache[0]) + + # set new cache event + await matrix.room_put_state(room_id=admin_room, + event_type=cache_event_type, + state_key=category, + content={'url_list': cache}) + next_update = datetime.now() + timedelta(minutes=15) - for mxid in next_batch: - sync = await matrix[mxid].sync(timeout=10000, - sync_filter=sync_filter, - since=next_batch[mxid]) - next_batch[mxid] = sync.next_batch - -homeserver_name = environ['HOMESERVER_NAME'] -homeserver_url = environ['HOMESERVER_URL'] -mxid_prefix = environ['MXID_PREFIX'] +homeserver = environ['HOMESERVER'] +mxid = environ['MXID'] +accesstoken = get_accesstoken_from_file(environ['ACCESSTOKEN_FILE']) admin_room = environ['ADMIN_ROOM'] +category = environ['CATEGORY'] -accesstoken = {} -for var in environ: - if (game := re.match('^ACCESSTOKEN_([A-Z]*)_FILE$', var)) is not None: - accesstoken[game[1].lower()] = get_accesstoken_from_file(environ[var]) - - - -matrix = {} asyncio.run(main())