diff --git a/README.md b/README.md index d3389fd..9387d97 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ docker run --rm \ -e MIXD=@heeeroooooooes:example.org -e ACCESSTOKEN_FILE=/heroesofthestorm \ -e ADMIN_ROOM='!iesofojasief90429ewiofj:example.org' \ - -e CATEGORY=heroesofthestorm + -e CATEGORY=heroesofthestorm,insideblizzard,battlenet gitea.lubiland.de/lub/snowstorm-matrix:latest ``` @@ -52,7 +52,7 @@ Or via docker-compose/swarm: - ACCESSTOKEN_FILE=/run/secrets/snowstorm-matrix_overwatch - MXID=@bastionrulez:example.com - ADMIN_ROOM=!jjpPluoxZoAOBQeYer:example.org - - CATEGORY=overwatch + - CATEGORY=overwatch,overwatch2 snowstorm-matrix_worldofwarcraft: image: gitea.lubiland.de/lub/snowstorm-matrix:latest deploy: diff --git a/scrape.py b/scrape.py index 468b701..5dba0f9 100644 --- a/scrape.py +++ b/scrape.py @@ -125,6 +125,7 @@ async def main(): # use this event type to store our url cache cache_event_type = 'de.lubiland.snowstorm-matrix.cache' + cache = {} while True: # do sync first to e.g. 
accept an admin room invite sync = await matrix.sync(timeout=30000, sync_filter=sync_filter) @@ -136,20 +137,22 @@ async def main(): if next_update < datetime.now(): # refresh url cache - cache_state = await matrix.room_get_state_event(room_id=admin_room, - event_type=cache_event_type, - state_key=category) - if hasattr(cache_state, 'content') and 'url_list' in cache_state.content: - cache = cache_state.content['url_list'] - else: - print('cache is empty') - cache = [] + old_cache = cache + cache = {} + for category in category_list: + cache_state = await matrix.room_get_state_event(room_id=admin_room, + event_type=cache_event_type, + state_key=category) + if hasattr(cache_state, 'content') and 'url_list' in cache_state.content: + if category not in cache: + cache[category] = [] + cache[category] += cache_state.content['url_list'] # scrape all blog posts and process them blog = get_blog() for post in blog: # check if post url is in cache and matches our category - if post['url'] not in cache and post['category'] == category: + if post['category'] in category_list and post['url'] not in cache.get(post['category'], []): # post url not found in cache # announce new post to matrix rooms print('new post: '+post['url']) @@ -169,21 +172,24 @@ async def main(): content=content) # add url to cache - cache += [post['url']] + cache.setdefault(post['category'], []).append(post['url']) else: # no new posts found pass - # trim the cache - while len(cache) > 100: - cache.remove(cache[0]) + # cleanup cache and push it as room state + for category in cache.keys(): + # trim the cache + while len(cache[category]) > 100: + cache[category].remove(cache[category][0]) - # set new cache event - await matrix.room_put_state(room_id=admin_room, - event_type=cache_event_type, - state_key=category, - content={'url_list': cache}) + # set new cache event + if old_cache.get(category) != cache[category]: + await 
matrix.room_put_state(room_id=admin_room, + event_type=cache_event_type, + state_key=category, + content={'url_list': cache[category]}) # wait between 15min and 30min to randomize scraping next_update = datetime.now() + timedelta(minutes=randrange(15, 30)) @@ -202,8 +208,9 @@ print('accesstoken_file: '+environ['ACCESSTOKEN_FILE']) admin_room = environ['ADMIN_ROOM'] print('admin_room: '+admin_room) -category = environ['CATEGORY'] -print('category: '+category) +category_list = environ['CATEGORY'].split(',') +print('categories:') +print(category_list) asyncio.run(main())