You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
174 lines
5.2 KiB
Python
174 lines
5.2 KiB
Python
import re
from typing import Optional
from urllib.parse import urlparse

import lxml.html
import requests
|
|
|
|
# Sentinel domain id: LinkGenerator.predict_domain returns it when no
# registered generator matches the file name.
DOMAIN_UNKNOWN = -1
|
|
|
|
|
|
class DomainIdentifier:
    """
    Makeshift-automated enum class to allow runtime expansion of an enum.

    Create an instance to get a new unique domain identifier that can resolve to an int.
    Every successfully created instance is recorded in a class-wide registry, so
    names are unique across the whole process.

    TODO find better way :D
    """

    __domain_list = []  # registry of every identifier created so far, in creation order

    def __init__(self, name: str):
        """
        Register a new identifier under the given name.

        :param name: unique name of the domain
        :raises ValueError: if an identifier with this name already exists
        """
        # Validate before touching any state, so a rejected duplicate leaves
        # neither the registry nor the instance half-initialised.
        if DomainIdentifier.get_identifier(name) is not None:
            raise ValueError(
                "The domain \"{0}\" is already in the list! Tried to create a duplicate!".format(name)
            )

        self.__name = name
        # The numeric id is the position this instance takes in the registry.
        self.__domain_number = len(DomainIdentifier.__domain_list)
        DomainIdentifier.__domain_list.append(self)

    def __repr__(self) -> str:
        return "{0}(name={1!r}, identifier={2})".format(
            type(self).__name__, self.__name, self.__domain_number
        )

    @property
    def name(self) -> str:
        """Name this identifier was registered under."""
        return self.__name

    @property
    def identifier(self) -> int:
        """Numeric id of this domain (its index in the registry)."""
        return self.__domain_number

    @staticmethod
    def get_identifier(name: str) -> Optional["DomainIdentifier"]:
        """
        Look up an already registered identifier by its name.

        :param name: name of the domain to look for
        :return: the matching DomainIdentifier, or None if the name is not registered
        """
        return next(
            (d for d in DomainIdentifier.__domain_list if d.name == name),
            None,
        )
|
|
|
|
|
|
class DomainLinkGenerator:
    """
    Base class for the link logic of a single domain.

    It wraps one DomainIdentifier and exposes that identifier's name and number.
    The domain specific operations (matching, link construction, tag scraping)
    all raise NotImplementedError in this class.
    """

    def __init__(self, domain: DomainIdentifier):
        """
        :param domain: identifier of the domain this generator is responsible for
        """
        self.__identifier = domain

    def match_file_name(self, file_name) -> bool:
        """
        Tell whether the file name plausibly originates from this domain.

        :param file_name: file name to test
        :return: True if the name fits the domain
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def get_domain_name(self) -> str:
        """
        :return: the name stored in the wrapped DomainIdentifier
        """
        wrapped = self.__identifier
        return wrapped.name

    def get_identifier(self) -> int:
        """
        :return: the numeric identifier of the wrapped DomainIdentifier
        """
        wrapped = self.__identifier
        return wrapped.identifier

    def construct_link(self, file_name: str) -> str:
        """
        Build a link by inserting the file name into the domain's link pattern.

        :param file_name: file name to build the link from
        :return: the constructed link
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def scrape_tags(self, url: str, headers: dict, file_name: str) -> list:
        """
        Scrape all tags associated with the work found at the given url.

        The file_name can also be used to check the given url against prediction results.

        :param url: address of the page to scrape
        :param headers: headers to use for the request
        :param file_name: file name the url was derived from
        :return: the scraped tags
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()
|
|
|
|
|
|
class LinkGenerator:
    """
    Predict and generate valid links to the file
    by matching the given file names against known patterns of the origin domain.

    All work is delegated to the registered DomainLinkGenerator objects; a
    generator that raises NotImplementedError for an operation is skipped.
    Use get_instance() to obtain the shared instance.
    """

    __instance = None  # Singleton holder

    def __init__(self):
        """Create the generator and register every domain returned by return_all_domains()."""
        self.__link_generators = []
        import ArtNet.web.domains  # implements return_all_domains() which returns instances of all domains
        # return_all_domains() is to return a list of all DomainLinkGenerator instances that are to be used

        for p in ArtNet.web.domains.return_all_domains():
            self.register_domain_predictor(p)

    @staticmethod
    def get_instance() -> "LinkGenerator":
        """
        Get the shared instance, creating it on first use.

        :return: the single LinkGenerator instance
        """
        if LinkGenerator.__instance is None:
            LinkGenerator.__instance = LinkGenerator()
        return LinkGenerator.__instance

    def register_domain_predictor(self, predictor: "DomainLinkGenerator"):
        """
        Register another DomainLinkGenerator to be used by this LinkGenerator.

        Registering a generator that is already registered is a no-op.

        :param predictor: the domain generator to add
        :return: None
        """
        if predictor not in self.__link_generators:
            self.__link_generators.append(predictor)

    def predict_domain(self, file_name: str) -> int:
        """
        Predict the domain of the given file by guessing via the filename.

        :param file_name: file name to match against the registered generators
        :return: identifier of the first generator that matches,
                 or DOMAIN_UNKNOWN if none does
        """
        for g in self.__link_generators:
            try:
                if g.match_file_name(file_name):  # TODO stop accepting first match
                    return g.get_identifier()
            except NotImplementedError:
                # This generator cannot match file names; try the next one.
                continue
        return DOMAIN_UNKNOWN

    def construct_link(self, file_name: str, domain: int) -> Optional[str]:
        """
        Construct a valid link to access the web page where the file name (hopefully) originated from.

        :param file_name: file name to build the link from
        :param domain: numeric identifier of the domain to build the link for
        :return: the link from the first generator of that domain which implements
                 construct_link, or None if there is no such generator
        """
        for g in self.__link_generators:
            if g.get_identifier() != domain:  # TODO stop accepting first match
                continue
            try:
                return g.construct_link(file_name)
            except NotImplementedError:
                # Skip generators without link support instead of giving up,
                # the same way predict_domain and scrape_tags do.
                continue

        return None

    def scrape_tags(self, url: str, domain: int, file_name: str) -> Optional[list]:
        """
        Scrapes the tags from the given url.

        A generator is used if either its identifier equals ``domain`` or its
        domain name equals the network location of ``url``. The result of the
        generator's scrape_tags is passed through unchanged.

        NOTE(review): this was annotated as returning dict, while
        DomainLinkGenerator.scrape_tags is annotated as returning list; the
        annotation here now follows the latter - confirm against the domain
        implementations.

        :param url: address of the page to scrape
        :param domain: numeric identifier of the domain the url belongs to
        :param file_name: file name the url was derived from
        :return: the scraped tags, or None if no suitable generator implements scraping
        """
        # Built per call so that each scraper receives its own dict.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:76.0) Gecko/20100101 Firefox/76.0"
        }
        url_domain = urlparse(url).netloc
        for g in self.__link_generators:
            if g.get_identifier() == domain or g.get_domain_name() == url_domain:
                try:
                    return g.scrape_tags(url=url, headers=headers, file_name=file_name)
                except NotImplementedError:
                    # This generator cannot scrape; try the next one.
                    continue
        return None
|