batoto.py

   1 import urllib, re, BeautifulSoup
   2 import lib, htcache
   3 soup = BeautifulSoup.BeautifulSoup
   4 soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)
   5
   6 def byclass(el, name, cl):
   7     for ch in el.findAll(name):
   8         if not isinstance(ch, BeautifulSoup.Tag): continue
   9         cll = ch.get("class", "")
  10         if cl in cll.split():
  11             return ch
  12     return None
  13
  14 def nextel(el):
  15     while True:
  16         el = el.nextSibling
  17         if isinstance(el, BeautifulSoup.Tag):
  18             return el
  19
  20 class page(lib.page):
  21     def __init__(self, chapter, stack, n, url):
  22         self.stack = stack
  23         self.chapter = chapter
  24         self.n = n
  25         self.id = str(n)
  26         self.name = u"Page %s" % n
  27         self.url = url
  28         self.ciurl = None
  29
  30     def iurl(self):
  31         if self.ciurl is None:
  32             page = soupify(htcache.fetch(self.url))
  33             img = nextel(page.find("div", id="full_image")).img
  34             self.ciurl = img["src"].encode("us-ascii")
  35         return self.ciurl
  36
  37     def open(self):
  38         return lib.stdimgstream(self.iurl())
  39
  40     def __str__(self):
  41         return self.name
  42
  43     def __repr(self):
  44         return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)
  45
  46 class chapter(lib.pagelist):
  47     def __init__(self, manga, stack, id, name, url):
  48         self.stack = stack
  49         self.manga = manga
  50         self.id = id
  51         self.name = name
  52         self.url = url
  53         self.cpag = None
  54
  55     def __getitem__(self, i):
  56         return self.pages()[i]
  57
  58     def __len__(self):
  59         return len(self.pages())
  60
  61     pnre = re.compile(r"page (\d+)")
  62     def pages(self):
  63         if self.cpag is None:
  64             pg = soupify(htcache.fetch(self.url))
  65             cpag = []
  66             for opt in pg.find("select", id="page_select").findAll("option"):
  67                 url = opt["value"].encode("us-ascii")
  68                 n = int(self.pnre.match(opt.string).group(1))
  69                 cpag.append(page(self, self.stack + [(self, len(cpag))], n, url))
  70             self.cpag = cpag
  71         return self.cpag
  72
  73     def __str__(self):
  74         return self.name
  75
  76     def __repr__(self):
  77         return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)
  78
  79 class manga(lib.manga):
  80     def __init__(self, lib, id, name, url):
  81         self.lib = lib
  82         self.id = id
  83         self.name = name
  84         self.url = url
  85         self.cch = None
  86         self.stack = []
  87         self.cnames = None
  88
  89     def __getitem__(self, i):
  90         return self.ch()[i]
  91
  92     def __len__(self):
  93         return len(self.ch())
  94
  95     cure = re.compile(r"/read/_/(\d+)/[^/]*")
  96     def ch(self):
  97         if self.cch is None:
  98             page = soupify(htcache.fetch(self.url))
  99             cls = byclass(page, u"table", u"chapters_list")
 100             if cls.tbody is not None:
 101                 cls = cls.tbody
 102             scl = u"lang_" + self.lib.lang
 103             cch = []
 104             for ch in cls.childGenerator():
 105                 if isinstance(ch, BeautifulSoup.Tag) and ch.name == u"tr":
 106                     cll = ch.get("class", "").split()
 107                     if u"row" in cll and scl in cll:
 108                         url = ch.td.a["href"].encode("us-ascii")
 109                         m = self.cure.search(url)
 110                         if m is None: raise Exception("Got weird chapter URL: %r" % url)
 111                         cid = m.group(1)
 112                         url = self.lib.base + "read/_/" + cid
 113                         name = ch.td.a.text
 114                         cch.append((cid, name, url))
 115             cch.reverse()
 116             rch = []
 117             for n, (cid, name, url) in enumerate(cch):
 118                 rch.append(chapter(self, [(self, n)], cid, name, url))
 119             self.cch = rch
 120         return self.cch
 121
 122     def altnames(self):
 123         if self.cnames is None:
 124             page = soupify(htcache.fetch(self.url))
 125             cnames = None
 126             for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
 127                 if tbl.tbody is not None: tbl = tbl.tbody
 128                 for tr in tbl.findAll("tr"):
 129                     if u"Alt Names:" in tr.td.text:
 130                         nls = nextel(tr.td)
 131                         if nls.name != u"td" or nls.span is None:
 132                             raise Exception("Weird altnames table in " + self.id)
 133                         cnames = [nm.text.strip() for nm in nls.findAll("span")]
 134                         break
 135                 if cnames is not None:
 136                     break
 137             if cnames is None:
 138                 raise Exception("Could not find altnames for " + self.id)
 139             self.cnames = cnames
 140         return self.cnames
 141
 142     def __str__(self):
 143         return self.name
 144
 145     def __repr__(self):
 146         return "<batoto.manga %r>" % self.name
 147
 148 class library(lib.library):
 149     def __init__(self):
 150         self.base = "http://www.batoto.net/"
 151         self.lang = u"English"
 152
 153     def byid(self, id):
 154         url = self.base + "comic/_/comics/" + id
 155         page = soupify(htcache.fetch(url))
 156         title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
 157         if title is None:
 158             raise KeyError(id)
 159         return manga(self, id, title.string.strip(), url)
 160
 161     def _search(self, pars):
 162         p = 1
 163         while True:
 164             _pars = dict(pars)
 165             _pars["p"] = str(p)
 166             resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
 167             try:
 168                 page = soupify(resp.read())
 169             finally:
 170                 resp.close()
 171             rls = page.find("div", id="comic_search_results").table
 172             if rls.tbody is not None:
 173                 rls = rls.tbody
 174             hasmore = False
 175             for child in rls.findAll("tr"):
 176                 if child.th is not None: continue
 177                 if child.get("id", u"")[:11] == u"comic_rowo_": continue
 178                 if child.get("id") == u"show_more_row":
 179                     hasmore = True
 180                     continue
 181                 link = child.td.strong.a
 182                 url = link["href"].encode("us-ascii")
 183                 m = self.rure.search(url)
 184                 if m is None: raise Exception("Got weird manga URL: %r" % url)
 185                 id = m.group(1)
 186                 name = link.text.strip()
 187                 yield manga(self, id, name, url)
 188             p += 1
 189             if not hasmore:
 190                 break
 191
 192     rure = re.compile(r"/comic/_/([^/]*)$")
 193     def search(self, expr):
 194         if not isinstance(expr, unicode):
 195             expr = expr.decode("utf8")
 196         return self._search({"name": expr.encode("utf8"), "name_cond": "c"})
 197
 198     def byname(self, prefix):
 199         if not isinstance(prefix, unicode):
 200             prefix = prefix.decode("utf8")
 201         for res in self._search({"name": prefix.encode("utf8"), "name_cond": "s"}):
 202             if res.name[:len(prefix)].lower() == prefix.lower():
 203                 yield res
 204             else:
 205                 for aname in res.altnames():
 206                     if aname[:len(prefix)].lower() == prefix.lower():
 207                         yield manga(self, res.id, aname, res.url)
 208                         break
 209                 else:
 210                     if False:
 211                         print "eliding " + res.name
 212                         print res.altnames()