"""Basic BeautifulSoup-based scraper for the Anime News Network encyclopedia.

Pages are fetched over HTTP, cached on disk for one day when a cache
directory is available, and parsed with bs4's "html.parser" backend.
"""
# Provenance (from the patch this module was extracted from):
#   From: Fredrik Tolf
#   Date: Wed, 13 Jan 2016 23:21:39 +0000 (+0100)
#   Subject: Made a basic BS-based Python ANN module.
#   X-Git-Url: http://git.dolda2000.com/gitweb/?a=commitdiff_plain;h=5b7914ac4a6aa32ac7b8417b286e6ba584fce636;hp=620488cac2bb28479f28d76e7cb1020deccd39a6;p=utils.git
#   New file: ann.py

import os, hashlib, urllib.request, time, re, weakref
from urllib.parse import urljoin, urlencode
import bs4
soup = lambda cont: bs4.BeautifulSoup(cont, "html.parser")

base = "http://www.animenewsnetwork.com/encyclopedia/"


class error(Exception):
    """Base class for all errors raised by this module."""
    pass


class incompatible(error):
    """Raised when a fetched page lacks an element this module relies on."""

    def __init__(self):
        super().__init__("ANN HTML has changed")


# Best-effort cache directory setup: when HOME is unset (os.getenv returns
# None, so os.path.join raises TypeError) or the directory cannot be created
# (OSError), caching is simply disabled by leaving cachedir as None.
try:
    cachedir = os.path.join(os.getenv("HOME"), ".ann", "cache")
    os.makedirs(cachedir, exist_ok=True)
except (TypeError, OSError):
    cachedir = None


def cachename(url):
    """Return the cache file path for *url*, or None when caching is disabled.

    The file name is the hex MD5 digest of the ASCII-encoded URL; the digest
    is used only as a file name, not for any security purpose.
    """
    if not cachedir:
        return None
    d = hashlib.md5()
    d.update(url.encode("ascii"))
    return os.path.join(cachedir, d.hexdigest())


def get(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    A cached copy is used when one exists and was modified less than
    86400 seconds (one day) ago; otherwise the page is downloaded and, when
    caching is enabled, written to the cache file.
    """
    data = None
    cachefile = cachename(url)
    if cachefile and os.path.exists(cachefile):
        if time.time() - os.stat(cachefile).st_mtime < 86400:
            with open(cachefile, "rb") as fp:
                data = fp.read()
    if data is None:
        with urllib.request.urlopen(url) as fp:
            data = fp.read()
        if cachefile:
            with open(cachefile, "wb") as co:
                co.write(data)
    return soup(data)


def s(s, rx, rep):
    """Replace the first case-insensitive match of regex *rx* in *s* by *rep*.

    *rep* is inserted literally (no backreference expansion).  Returns *s*
    unchanged when there is no match.
    """
    m = re.search(rx, s, re.I)
    if m:
        return s[:m.start()] + rep + s[m.end():]
    return s


def afind(soup, *args, **kwargs):
    """Like ``soup.find(*args, **kwargs)``, but raise on a missing element.

    Raises:
        incompatible: if ``find`` returns None.
    """
    ret = soup.find(*args, **kwargs)
    if ret is None:
        raise incompatible()
    return ret


def cstr(soup):
    """Return the concatenated string content of *soup*.

    Tags and lists are flattened recursively by iterating their elements;
    anything else contributes its ``.string`` attribute.
    """
    if isinstance(soup, (bs4.Tag, list)):
        return "".join(cstr(el) for el in soup)
    return soup.string


class cproperty(object):
    """Descriptor that computes a value once per instance and caches it.

    The wrapped function *bk* is called with the instance on first access.
    Results are kept in a WeakKeyDictionary keyed by instance, so the cache
    does not keep instances alive.  The value may also be assigned directly
    (overriding the computation) or deleted (forcing recomputation on the
    next access).
    """
    # Sentinel distinguishing "nothing cached" from a cached None.
    _default = object()

    def __init__(self, bk):
        self.bk = bk
        self.cache = weakref.WeakKeyDictionary()

    def __get__(self, ins, cls):
        # Accessed on the class rather than an instance: expose the descriptor.
        if ins is None:
            return self
        ret = self.cache.get(ins, self._default)
        if ret is self._default:
            ret = self.bk(ins)
            self.cache[ins] = ret
        return ret

    def __set__(self, ins, val):
        self.cache[ins] = val

    def __delete__(self, ins):
        if ins in self.cache:
            del self.cache[ins]


class anime(object):
    """A single encyclopedia anime entry, identified by its numeric id.

    Page contents are fetched and parsed lazily, on first access to a
    property that needs them.
    """

    def __init__(self, id):
        self.id = id
        self.url = urljoin(base, "anime.php?id=%i" % self.id)

    @cproperty
    def _page(self):
        """The parsed entry page."""
        return get(self.url)

    @cproperty
    def _main(self):
        """The ``<div id="maincontent">`` element of the entry page."""
        return afind(self._page, "div", id="maincontent")

    @cproperty
    def _info(self):
        """Mapping from lower-cased info label to the nodes following it.

        Built from every ``div`` with class ``encyc-info-type`` inside the
        ``content-zone`` div that has a ``<strong>`` label.
        """
        ret = {}
        for t in afind(self._main, "div", id="content-zone")("div", "encyc-info-type"):
            if t.strong:
                # [:-1] drops the label's final character (presumably the
                # trailing ':' -- confirm against the live markup).
                key = t.strong.text.lower().strip()[:-1]
                ret[key] = t.contents[t.contents.index(t.strong) + 1:]
        return ret

    @cproperty
    def rawname(self):
        """The page header text, including any parenthesized suffix."""
        # The original evaluated this expression without returning it, so
        # rawname was always None.
        return afind(self._main, "h1", id="page_header").text

    # Splits "Some Title (suffix)" into the title and the parenthesized suffix.
    _nre = re.compile(r"^(.*\S)\s+\(([^\)]+)\)$")

    @cproperty
    def _sname(self):
        """``(name, type)`` split out of rawname; type is None if no suffix."""
        m = self._nre.search(self.rawname)
        if not m:
            return (self.rawname, None)
        return m.groups()[0:2]

    @property
    def name(self):
        """The entry name without the parenthesized suffix."""
        return self._sname[0]

    @property
    def type(self):
        """The parenthesized suffix of the header, or None if absent."""
        return self._sname[1]

    @cproperty
    def eps(self):
        """The "number of episodes" info field as an int.

        Raises KeyError when the page has no such field.
        """
        return int(cstr(self._info["number of episodes"]))

    def __repr__(self):
        # NOTE(review): the format string reached us empty, which made this
        # method raise TypeError; the angle-bracketed text was evidently
        # stripped in transit.  The exact upstream wording is unconfirmed.
        return "<ann.anime: %r (%i)>" % (self.name, self.id)

    def __str__(self):
        return self.name


linkpat = re.compile(r"^/encyclopedia/anime\.php\?id=(\d+)$")


def getlist(name):
    """Return the anime entries whose listed name starts with *name*.

    A leading "the " is stripped from *name* first.  The listing page is
    selected by the first character: an ASCII letter selects that letter's
    page (upper-cased), anything else selects the '9' page.  Matching is
    case-insensitive.  Each returned anime object has its rawname pre-set
    from the listing so that no extra page fetch is needed for the name.

    Raises:
        error: if *name* is empty after stripping the article.
        incompatible: if the listing page lacks the expected structure.
    """
    name = s(name, r"^the\s+", "")
    if not name:
        raise error("list() needs a prefix of at least one character")
    fc = name[0]
    if 'a' <= fc <= 'z' or 'A' <= fc <= 'Z':
        fc = fc.upper()
    else:
        fc = '9'
    d = get(urljoin(base, "anime.php?" + urlencode({"list": fc})))
    ret = []
    ldiv = afind(afind(d, "div", id="maincontent"), "div", "lst")
    prefix = name.lower()
    for link in ldiv("a", "HOVERLINE"):
        font = link.font
        if font is None:
            # A link without the expected <font> child means the layout
            # changed; report it the same way as other structural mismatches.
            raise incompatible()
        # Only direct text children count towards the name; nested tags
        # inside <font> are skipped.
        mn = "".join(el.strip() for el in font if isinstance(el, str))
        if mn.lower().startswith(prefix):
            m = linkpat.match(link["href"])
            if not m:
                raise incompatible()
            found = anime(int(m.groups()[0]))
            found.rawname = mn
            ret.append(found)
    return ret