import urllib, re, BeautifulSoup
import lib, htcache
soup = BeautifulSoup.BeautifulSoup
soupify = lambda cont: soup(cont, convertEntities=soup.HTML_ENTITIES)

# Return the first descendant of `el` with tag `name` that carries the class
# `cl`, or None if there is no such tag.
def byclass(el, name, cl):
    for ch in el.findAll(name):
        if not isinstance(ch, BeautifulSoup.Tag): continue
        cll = ch.get("class", "")
        if cl in cll.split():
            return ch
    return None

# Return the next sibling of `el` that is a tag, skipping over text nodes.
def nextel(el):
    while True:
        el = el.nextSibling
        if isinstance(el, BeautifulSoup.Tag):
            return el
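
# For reference: given doc = soupify("<div><p class='a b'>x</p> <p>y</p></div>"),
# byclass(doc, u"p", u"a") returns the first <p>, and nextel(doc.p) skips the
# intervening text node and returns the second <p>.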

class page(lib.page):
    def __init__(self, chapter, stack, n, url):
        self.stack = stack
        self.chapter = chapter
        self.n = n
        self.id = str(n)
        self.name = u"Page %s" % n
        self.url = url
        self.ciurl = None

    # Fetch the reader page lazily and cache the URL of the full-size image.
    def iurl(self):
        if self.ciurl is None:
            page = soupify(htcache.fetch(self.url))
            img = nextel(page.find("div", id="full_image")).img
            self.ciurl = img["src"].encode("us-ascii")
        return self.ciurl

    def open(self):
        return lib.stdimgstream(self.iurl())

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.page %r.%r.%r>" % (self.chapter.manga.name, self.chapter.name, self.name)

class chapter(lib.pagelist):
    def __init__(self, manga, stack, id, name, url):
        self.stack = stack
        self.manga = manga
        self.id = id
        self.name = name
        self.url = url
        self.cpag = None

    def __getitem__(self, i):
        return self.pages()[i]

    def __len__(self):
        return len(self.pages())

    pnre = re.compile(r"page (\d+)")
    def pages(self):
        if self.cpag is None:
            pg = soupify(htcache.fetch(self.url))
            cpag = []
            for opt in pg.find("select", id="page_select").findAll("option"):
                url = opt["value"].encode("us-ascii")
                n = int(self.pnre.match(opt.string).group(1))
                cpag.append(page(self, self.stack + [(self, len(cpag))], n, url))
            self.cpag = cpag
        return self.cpag

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.chapter %r.%r>" % (self.manga.name, self.name)

class manga(lib.manga):
    def __init__(self, lib, id, name, url):
        self.lib = lib
        self.id = id
        self.name = name
        self.url = url
        self.cch = None
        self.stack = []
        self.cnames = None

    def __getitem__(self, i):
        return self.ch()[i]

    def __len__(self):
        return len(self.ch())

    # Parse the manga's overview page and build the chapter list, keeping only
    # rows for the library's configured language and normalizing reader URLs.
    cure = re.compile(r"/read/_/(\d+)/[^/]*")
    def ch(self):
        if self.cch is None:
            page = soupify(htcache.fetch(self.url))
            cls = byclass(page, u"table", u"chapters_list")
            if cls.tbody is not None:
                cls = cls.tbody
            scl = u"lang_" + self.lib.lang
            cch = []
            for ch in cls.childGenerator():
                if isinstance(ch, BeautifulSoup.Tag) and ch.name == u"tr":
                    cll = ch.get("class", "").split()
                    if u"row" in cll and scl in cll:
                        url = ch.td.a["href"].encode("us-ascii")
                        m = self.cure.search(url)
                        if m is None: raise Exception("Got weird chapter URL: %r" % url)
                        cid = m.group(1)
                        url = self.lib.base + "read/_/" + cid
                        name = ch.td.a.text
                        cch.append((cid, name, url))
            cch.reverse()
            rch = []
            for n, (cid, name, url) in enumerate(cch):
                rch.append(chapter(self, [(self, n)], cid, name, url))
            self.cch = rch
        return self.cch

    def altnames(self):
        if self.cnames is None:
            page = soupify(htcache.fetch(self.url))
            cnames = None
            for tbl in page.findAll("table", attrs={"class": "ipb_table"}):
                if tbl.tbody is not None: tbl = tbl.tbody
                for tr in tbl.findAll("tr"):
                    if u"Alt Names:" in tr.td.text:
                        nls = nextel(tr.td)
                        if nls.name != u"td" or nls.span is None:
                            raise Exception("Weird altnames table in " + self.id)
                        cnames = [nm.text.strip() for nm in nls.findAll("span")]
                        break
                if cnames is not None:
                    break
            if cnames is None:
                raise Exception("Could not find altnames for " + self.id)
            self.cnames = cnames
        return self.cnames

    def __str__(self):
        return self.name

    def __repr__(self):
        return "<batoto.manga %r>" % self.name

class library(lib.library):
    def __init__(self):
        self.base = "http://www.batoto.net/"
        self.lang = u"English"

    def byid(self, id):
        url = self.base + "comic/_/comics/" + id
        page = soupify(htcache.fetch(url))
        title = page.find("h1", attrs={"class": "ipsType_pagetitle"})
        if title is None:
            raise KeyError(id)
        return manga(self, id, title.string.strip(), url)

    # Generator that walks the site's paginated search results, yielding one
    # manga object per result row until no "show more" row is present.
    def _search(self, pars):
        p = 1
        while True:
            _pars = dict(pars)
            _pars["p"] = str(p)
            resp = urllib.urlopen(self.base + "search?" + urllib.urlencode(_pars))
            try:
                page = soupify(resp.read())
            finally:
                resp.close()
            rls = page.find("div", id="comic_search_results").table
            if rls.tbody is not None:
                rls = rls.tbody
            hasmore = False
            for child in rls.findAll("tr"):
                if child.th is not None: continue
                if child.get("id", u"")[:11] == u"comic_rowo_": continue
                if child.get("id") == u"show_more_row":
                    hasmore = True
                    continue
                link = child.td.strong.a
                url = link["href"].encode("us-ascii")
                m = self.rure.search(url)
                if m is None: raise Exception("Got weird manga URL: %r" % url)
                id = m.group(1)
                name = link.text.strip()
                yield manga(self, id, name, url)
            p += 1
            if not hasmore:
                break

    rure = re.compile(r"/comic/_/([^/]*)$")
    def search(self, expr):
        if not isinstance(expr, unicode):
            expr = expr.decode("utf8")
        return self._search({"name": expr.encode("utf8"), "name_cond": "c"})

    # Prefix search: yield results whose primary name starts with `prefix`;
    # otherwise fall back to matching against the manga's alternate names.
    def byname(self, prefix):
        if not isinstance(prefix, unicode):
            prefix = prefix.decode("utf8")
        for res in self._search({"name": prefix.encode("utf8"), "name_cond": "s"}):
            if res.name[:len(prefix)].lower() == prefix.lower():
                yield res
            else:
                for aname in res.altnames():
                    if aname[:len(prefix)].lower() == prefix.lower():
                        yield manga(self, res.id, aname, res.url)
                        break
                else:
                    if False:
                        print "eliding " + res.name
                        print res.altnames()
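
# A minimal usage sketch rather than part of the scraper itself; it assumes the
# surrounding `lib` and `htcache` modules are importable and that the site
# still serves the markup the parsers above expect. The search term is only a
# placeholder.
if __name__ == "__main__":
    import sys
    term = sys.argv[1] if len(sys.argv) > 1 else "example title"
    blib = library()
    for mng in blib.search(term):
        print "%s (%s): %d chapters" % (mng.name, mng.id, len(mng))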