diff --git a/scrape.py b/scrape.py index 2e140b1..de9ad70 100644 --- a/scrape.py +++ b/scrape.py @@ -116,22 +116,6 @@ async def main(): } } } - next_batch_state = await matrix[mxid].room_get_state_event(room_id=admin_room, - event_type=event_type_prefix+'next_batch', - state_key=mxid) - if 'token' in next_batch_state.content: - try: - sync = await matrix[mxid].sync(timeout=3000, - sync_filter=sync_filter, - since=next_batch_state.content['token']) - next_batch[mxid] = sync.next_batch - - continue - except: - pass - # when there is no next_batch token or first sync threw an error, - # then do a first sync without next_batch - print('doing first sync for '+mxid) sync = await matrix[mxid].sync(timeout=3000, sync_filter=sync_filter) next_batch[mxid] = sync.next_batch @@ -139,51 +123,66 @@ async def main(): next_update = datetime.now() while True: if next_update < datetime.now(): + # refresh url cache + cache_state = await matrix[next(iter(matrix))].room_get_state_event(room_id=admin_room, + event_type=event_type_prefix+'cache', + state_key='') + if hasattr(cache_state, 'content') and 'url_list' in cache_state.content: + cache = cache_state.content['url_list'] + else: + cache = [] + + # scrape all blog posts and process them blog = get_blog() for post in blog: - mxid = '@'+mxid_prefix+post['game']+':'+homeserver_name - if mxid in matrix: - content = { - 'msgtype': 'm.notice', - 'body': get_body(post), - 'format': 'org.matrix.custom.html', - 'formatted_body': get_formatted_body(post) - } - for room_id in matrix[mxid].rooms: - await matrix[mxid].room_send(room_id=room_id, - message_type='m.room.message', - content=content) + if post['url'] not in cache: + # post url not found in cache + mxid = '@'+mxid_prefix+post['game']+':'+homeserver_name + if mxid in matrix: + # announce new post to matrix rooms + content = { + 'msgtype': 'm.notice', + 'body': get_body(post), + 'format': 'org.matrix.custom.html', + 'formatted_body': get_formatted_body(post) + } + for room_id in 
matrix[mxid].rooms: + await matrix[mxid].room_send(room_id=room_id, + message_type='m.room.message', + content=content) + else: + # no accesstoken for the calculated mxid + content = { + 'msgtype': 'm.notice', + 'body': 'No accesstoken for '+mxid+' available.', + 'format': 'org.matrix.custom.html', + 'formatted_body': ('No accesstoken for '+ + ''+mxid+' available.') + } + # send the message with the first available matrix client, + # because we will always have at least one accesstoken + await matrix[next(iter(matrix))].room_send(room_id=admin_room, + message_type='m.room.message', + content=content) + + # add url to cache + cache += [post['url']] + while len(cache) > len(blog): + cache.remove(cache[0]) + set_state = await matrix[next(iter(matrix))].room_put_state(room_id=admin_room, + event_type=event_type_prefix+'cache', + content={'url_list': cache}) else: - content = { - 'msgtype': 'm.notice', - 'body': 'No accesstoken for '+mxid+' available.', - 'format': 'org.matrix.custom.html', - 'formatted_body': ('No accesstoken for '+ - ''+mxid+' available.') - } - # send the message with the first available matrix client, - # because we will always have at least one accesstoken - await matrix[next(iter(matrix))].room_send(room_id=admin_room, - message_type='m.room.message', - content=content) - + # no new posts found + pass next_update = datetime.now() + timedelta(minutes=30) - next_batch_state = await matrix[mxid].room_get_state_event(room_id=admin_room, - event_type=event_type_prefix+'next_batch', - state_key=mxid) - for mxid in next_batch: sync = await matrix[mxid].sync(timeout=10000, sync_filter=sync_filter, since=next_batch[mxid]) next_batch[mxid] = sync.next_batch - await matrix[mxid].room_put_state(room_id=admin_room, - event_type=event_type_prefix+'next_batch', - state_key=mxid, - content={'token': next_batch[mxid]}) - homeserver_name = environ['HOMESERVER_NAME'] homeserver_url = environ['HOMESERVER_URL']