import re
from typing import Optional
from urllib.parse import urlparse

import lxml.html
import requests

# Identifier returned when no registered generator recognises a file name.
DOMAIN_UNKNOWN = -1


class DomainIdentifier:
    """
    Makeshift-automated enum class to allow runtime expansion of an enum.

    Create an instance to get a new unique domain identifier that can resolve
    to an int. Every instance is recorded in a class-level registry; its
    number is the registry length at creation time, so identifiers count up
    from 0 in creation order.

    TODO find better way :D
    """

    # Registry of every identifier created so far, shared by all instances.
    __domain_list = []

    def __init__(self, name: str):
        """
        Register a new domain under the next free number.

        :param name: unique name of the domain
        :raises ValueError: if an identifier with this name already exists
        """
        # Reject duplicates before touching any state so a failed
        # construction leaves the registry untouched.
        if DomainIdentifier.get_identifier(name) is not None:
            raise ValueError(
                "The domain \"{0}\" is already in the list! Tried to create a duplicate!".format(name)
            )
        self.__name = name
        self.__domain_number = len(DomainIdentifier.__domain_list)
        DomainIdentifier.__domain_list.append(self)

    @property
    def name(self) -> str:
        """Name this identifier was registered under."""
        return self.__name

    @property
    def identifier(self) -> int:
        """Integer value of this identifier."""
        return self.__domain_number

    @staticmethod
    def get_identifier(name: str) -> Optional["DomainIdentifier"]:
        """
        Look up a registered identifier by its name.

        :param name: domain name to search for
        :return: the matching DomainIdentifier, or None if none is registered
        """
        for known in DomainIdentifier.__domain_list:
            if known.name == name:
                return known
        return None


class DomainLinkGenerator:
    """
    Base class for the per-domain logic used by LinkGenerator.

    match_file_name, construct_link and scrape_tags raise NotImplementedError
    here; LinkGenerator catches that exception, so a subclass only needs to
    override the operations it supports.
    """

    def __init__(self, domain: DomainIdentifier):
        """
        :param domain: identifier of the domain this generator handles
        """
        self.__identifier = domain

    def match_file_name(self, file_name) -> bool:
        """
        Checks if a given file name is plausible to be used by the given domain

        :param file_name: file name to test
        :return: True if the file name fits this domain
        """
        raise NotImplementedError

    def get_domain_name(self) -> str:
        """
        Return the name of this generator's domain.

        :return: the name stored in the DomainIdentifier
        """
        return self.__identifier.name

    def get_identifier(self) -> int:
        """
        Return the Identifier for this Predictors domain.

        :return: the integer value of the DomainIdentifier
        """
        return self.__identifier.identifier

    def construct_link(self, file_name: str) -> str:
        """
        Construct a link by inserting the file_name into the known link pattern

        :param file_name: file name to build the link from
        :return: the constructed link
        """
        raise NotImplementedError

    def scrape_tags(self, url: str, headers: dict, file_name: str) -> list:
        """
        Scrape the tags from the given url for all tags associated with the work.

        The file_name can also be used to check the given url against
        prediction results.

        :param url: page to scrape
        :param headers: HTTP headers to send with the request
        :param file_name: file name the url was derived from
        :return: the scraped tags
        """
        raise NotImplementedError


class LinkGenerator:
    """
    Predict and generate valid links to the file by matching the given file
    names against known patterns of the origin domain.
    """

    __instance = None  # Singleton holder

    def __init__(self):
        """Create the generator and register every known domain."""
        self.__link_generators = []
        # Imported at call time rather than at module level -- presumably to
        # avoid a circular import with ArtNet.web.domains (TODO confirm).
        # The module implements return_all_domains(), which is to return a
        # list of all DomainLinkGenerator instances that are to be used.
        import ArtNet.web.domains
        for predictor in ArtNet.web.domains.return_all_domains():
            self.register_domain_predictor(predictor)

    @staticmethod
    def get_instance() -> "LinkGenerator":
        """
        Gets the current instance, creating it on first use.

        :return: the shared LinkGenerator
        """
        if LinkGenerator.__instance is None:
            LinkGenerator.__instance = LinkGenerator()
        return LinkGenerator.__instance

    def register_domain_predictor(self, predictor: DomainLinkGenerator):
        """
        Register another DomainLinkGenerator to be used by this LinkGenerator.

        Registering a predictor that is already in the list is a no-op.

        :param predictor: generator to add
        :return: None
        """
        if predictor not in self.__link_generators:
            self.__link_generators.append(predictor)

    def predict_domain(self, file_name: str) -> int:
        """
        Predict the possible domains of the given file by guessing via the filename

        Generators are asked in registration order; ones that do not
        implement match_file_name are skipped.

        :param file_name: file name to classify
        :return: identifier of the first matching domain, or DOMAIN_UNKNOWN
        """
        for generator in self.__link_generators:
            try:
                if generator.match_file_name(file_name):  # TODO stop accepting first match
                    return generator.get_identifier()
            except NotImplementedError:
                continue
        return DOMAIN_UNKNOWN

    def construct_link(self, file_name: str, domain: int) -> Optional[str]:
        """
        Construct a valid link to access the web page where the file name
        (hopefully) originated from.

        A generator for the requested domain that does not implement
        construct_link is skipped, matching how predict_domain and
        scrape_tags treat unimplemented generators.

        :param file_name: file name to build the link from
        :param domain: integer identifier of the domain to use
        :return: the link, or None if no generator for the domain can build one
        """
        for generator in self.__link_generators:
            if generator.get_identifier() != domain:
                continue
            try:
                return generator.construct_link(file_name)  # TODO stop accepting first match
            except NotImplementedError:
                continue
        return None

    def scrape_tags(self, url: str, domain: int, file_name: str) -> Optional[list]:
        """
        Scrapes the tags from the given url

        A generator is used if its identifier equals domain or if its domain
        name equals the network location of url (exact string comparison).
        Generators that do not implement scrape_tags are skipped.

        :param url: page to scrape
        :param domain: integer identifier of the domain to use
        :param file_name: file name the url was derived from
        :return: result of the first usable generator, or None if there is none
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:76.0) Gecko/20100101 Firefox/76.0"
        }
        url_domain = urlparse(url).netloc
        for generator in self.__link_generators:
            if generator.get_identifier() == domain or generator.get_domain_name() == url_domain:
                try:
                    return generator.scrape_tags(url=url, headers=headers, file_name=file_name)
                except NotImplementedError:
                    continue
        return None