# ArtNet link generation: predict a file's origin domain from its file name
# and construct/scrape links via registered DomainLinkGenerator implementations.

import re
import requests
import lxml.html
from urllib.parse import urlparse
# Sentinel returned by LinkGenerator.predict_domain when no registered
# domain matches the given file name.
DOMAIN_UNKNOWN = -1
class DomainIdentifier:
    """
    Makeshift-automated enum class to allow runtime expansion of an enum.
    Create an instance to get a new unique domain identifier that can resolve to an int.

    Each instance is recorded in a class-level registry; its numeric
    identifier is its position in that registry.
    TODO find better way :D
    """

    # Registry of every DomainIdentifier created so far, in creation order.
    __domain_list = []

    def __init__(self, name: str):
        """
        Register a new domain under the given unique name.

        :param name: unique domain name
        :raises ValueError: if a domain with this name is already registered
        """
        # Check for duplicates before touching any state, so a failed
        # construction leaves the registry untouched.
        if DomainIdentifier.get_identifier(name) is not None:
            raise ValueError("The domain \"{0}\" is already in the list! Tried to create a duplicate!".format(name))
        self.__name = name
        self.__domain_number = len(DomainIdentifier.__domain_list)
        DomainIdentifier.__domain_list.append(self)

    @property
    def name(self) -> str:
        """The domain's name."""
        return self.__name

    @property
    def identifier(self) -> int:
        """The domain's unique numeric identifier."""
        return self.__domain_number

    @staticmethod
    def get_identifier(name: str) -> "DomainIdentifier":
        """
        Look up a registered domain by name.

        :param name: domain name to search for
        :return: the matching DomainIdentifier, or None if none is registered
        """
        for domain in DomainIdentifier.__domain_list:
            if domain.name == name:
                return domain
        return None
class DomainLinkGenerator:
    """
    Abstract base for per-domain link handling.

    Subclasses implement file-name matching, link construction and tag
    scraping for exactly one origin domain.
    """

    def __init__(self, domain: DomainIdentifier):
        self.__domain = domain

    def get_domain_name(self) -> str:
        """Return the human-readable name of this generator's domain."""
        return self.__domain.name

    def get_identifier(self) -> int:
        """
        Return the Identifier for this Predictors domain.
        :return:
        """
        return self.__domain.identifier

    def match_file_name(self, file_name) -> bool:
        """
        Checks if a given file name is plausible to be used by the given domain
        :param file_name:
        :return:
        """
        raise NotImplementedError

    def construct_link(self, file_name: str) -> str:
        """
        Construct a link by inserting the file_name into the known link pattern
        :param file_name:
        :return:
        """
        raise NotImplementedError

    def scrape_tags(self, url: str, headers: dict, file_name: str) -> list:
        """
        Scrape the tags from the given url for all tags associated with the work.
        The file_name can also be used to check the given url against prediction results.
        :param url:
        :param headers:
        :param file_name:
        :return:
        """
        raise NotImplementedError
class LinkGenerator:
    """
    Predict and generate valid links to the file
    by matching the given file names against known patterns of the origin domain.
    """

    __instance = None  # Singleton holder

    def __init__(self):
        self.__link_generators = []
        import ArtNet.web.domains  # implements return_all_domains() which returns instances of all domains
        # return_all_domains() is to return a list of all DomainLinkGenerator instances that are to be used
        for predictor in ArtNet.web.domains.return_all_domains():
            self.register_domain_predictor(predictor)

    @staticmethod
    def get_instance() -> "LinkGenerator":
        """
        Gets the current instance, creating it on first use (lazy singleton).
        :return: the single LinkGenerator instance
        """
        if LinkGenerator.__instance is None:
            LinkGenerator.__instance = LinkGenerator()
        return LinkGenerator.__instance

    def register_domain_predictor(self, predictor: "DomainLinkGenerator"):
        """
        Register another DomainLinkGenerator to be used by this LinkGenerator.
        Registering the same instance twice is a no-op.
        :param predictor: the generator to add
        :return:
        """
        if predictor not in self.__link_generators:
            self.__link_generators.append(predictor)

    def predict_domain(self, file_name: str) -> int:
        """
        Predict the possible domains of the given file by guessing via the filename
        :param file_name:
        :return: the matching domain identifier, or DOMAIN_UNKNOWN if no
            registered generator recognises the file name
        """
        for g in self.__link_generators:
            try:
                if g.match_file_name(file_name):  # TODO stop accepting first match
                    return g.get_identifier()
            except NotImplementedError:
                # Generators without a matcher simply never match.
                pass
        return DOMAIN_UNKNOWN

    def construct_link(self, file_name: str, domain: int) -> str:
        """
        Construct a valid link to access the web page where the file name (hopefully) originated from.
        :param file_name:
        :param domain:
        :return: the constructed URL, or None when no generator for the domain
            exists or it cannot build links
        """
        for g in self.__link_generators:
            if g.get_identifier() == domain:  # TODO stop accepting first match
                try:
                    return g.construct_link(file_name)
                except NotImplementedError:
                    return None
        return None

    def scrape_tags(self, url: str, domain: int, file_name: str) -> dict:
        """
        Scrapes the tags from the given url, using the generator whose
        identifier matches *domain* or whose domain name matches the url's host.
        NOTE(review): annotated -> dict here but DomainLinkGenerator.scrape_tags
        is annotated -> list; the actual shape depends on the subclass — confirm.
        :param url:
        :param domain:
        :param file_name:
        :return: the scraped tags, or None when no generator handled the url
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:76.0) Gecko/20100101 Firefox/76.0"
        }
        url_domain = urlparse(url).netloc
        for g in self.__link_generators:
            if g.get_identifier() == domain or g.get_domain_name() == url_domain:
                try:
                    return g.scrape_tags(url=url, headers=headers, file_name=file_name)
                except NotImplementedError:
                    pass
        return None