1 --- a/MANIFEST.in Thu Jul 03 17:11:35 2008 +0200
2 +++ b/MANIFEST.in Thu Jul 03 17:11:44 2008 +0200
3 @@ -63,6 +63,7 @@
4 include mwlib/serve.py
5 include mwlib/snippets.py
6 include mwlib/snippets.txt
7 +include mwlib/tagext.py
8 include mwlib/texmap.py
9 include mwlib/timeline.py
10 include mwlib/uparser.py
11 @@ -97,6 +98,7 @@
12 include tests/test_sanitychecker.py
13 include tests/test_scanner.py
14 include tests/test_table.py
15 +include tests/test_tagext.py
16 include tests/test_timeline.py
17 include tests/test_utils.py
18 include tests/test_xhtmlwriter.py
1.1 --- a/docs/metabook.txt Thu Jul 03 17:11:35 2008 +0200
1.2 +++ b/docs/metabook.txt Thu Jul 03 17:11:44 2008 +0200
1.3 @@ -73,6 +73,10 @@
1.4 name (string):
1.5
1.6 Unique name of source, e.g. "Wikipedia (en)"
1.7 +
1.8 +language (string):
1.9 +
1.10 + 2-character ISO code of language, e.g. "en"
1.11
1.12
1.13 License
2.1 --- a/mwlib/cdb.py Thu Jul 03 17:11:35 2008 +0200
2.2 +++ b/mwlib/cdb.py Thu Jul 03 17:11:44 2008 +0200
2.3 @@ -48,7 +48,7 @@
2.4 def close(self):
2.5 self.map.close()
2.6
2.7 - def __iter__(self, fn=None):
2.8 + def __iter__(self):
2.9 len = 2048
2.10 while len < self.eod:
2.11 klen, vlen = struct.unpack("<LL", self.map[len:len+8])
2.12 @@ -57,37 +57,25 @@
2.13 len += klen
2.14 val = self.map[len:len+vlen]
2.15 len += vlen
2.16 - if fn:
2.17 - yield fn(key, val)
2.18 - else:
2.19 - yield (key, val)
2.20 + yield (key, val)
2.21
2.22 def iteritems(self):
2.23 return self.__iter__()
2.24
2.25 def iterkeys(self):
2.26 - return self.__iter__(lambda k,v: k)
2.27 + return (k for k, v in self)
2.28
2.29 def itervalues(self):
2.30 - return self.__iter__(lambda k,v: v)
2.31 + return (v for k, v in self)
2.32
2.33 def items(self):
2.34 - ret = []
2.35 - for i in self.iteritems():
2.36 - ret.append(i)
2.37 - return ret
2.38 + return list(self.iteritems())
2.39
2.40 def keys(self):
2.41 - ret = []
2.42 - for i in self.iterkeys():
2.43 - ret.append(i)
2.44 - return ret
2.45 + return list(self.iterkeys())
2.46
2.47 def values(self):
2.48 - ret = []
2.49 - for i in self.itervalues():
2.50 - ret.append(i)
2.51 - return ret
2.52 + return list(self.itervalues())
2.53
2.54 def findstart(self):
2.55 self.loop = 0
3.1 --- a/mwlib/cdbwiki.py Thu Jul 03 17:11:35 2008 +0200
3.2 +++ b/mwlib/cdbwiki.py Thu Jul 03 17:11:44 2008 +0200
3.3 @@ -8,206 +8,129 @@
3.4 import zlib
3.5 import re
3.6
3.7 -from mwlib import cdb
3.8 -
3.9 -try:
3.10 - from xml.etree import cElementTree
3.11 -except ImportError:
3.12 - import cElementTree
3.13 -
3.14 -ns = '{http://www.mediawiki.org/xml/export-0.3/}'
3.15 -
3.16 -wikiindex = "wikiidx"
3.17 -wikidata = "wikidata.bin"
3.18 -
3.19 -
3.20 +from mwlib import cdb, dumpparser
3.21
3.22 def normname(name):
3.23 name = name.strip().replace("_", " ")
3.24 name = name[:1].upper()+name[1:]
3.25 return name
3.26
3.27 -class Tags:
3.28 - page = ns + 'page'
3.29 +class ZCdbWriter(cdb.CdbMake):
3.30 + def __init__(self, indexpath, datapath=None):
3.31 + if not datapath:
3.32 + datapath = indexpath + 'data.bin'
3.33 + indexpath = indexpath + 'idx.cdb'
3.34
3.35 - # <title> inside <page>
3.36 - title = ns + 'title'
3.37 + cdb.CdbMake.__init__(self, open(indexpath, 'wb'))
3.38 + self.data = open(datapath, 'wb')
3.39
3.40 - # <revision> inside <page>
3.41 - revision = ns + 'revision'
3.42 + def add(self, key, val):
3.43 + key = key.encode("utf-8")
3.44 + val = zlib.compress(val.encode('utf-8')) # NOTE: encode wasn't in original
3.45 + pos = self.data.tell()
3.46 + self.data.write(val)
3.47 + cdb.CdbMake.add(self, key, "%s %s" % (pos, len(val)))
3.48
3.49 - # <id> inside <revision>
3.50 - revid = ns + 'id'
3.51 + def finish(self):
3.52 + cdb.CdbMake.finish(self)
3.53 + self.data.close()
3.54
3.55 - # <contributor><username> inside <revision>
3.56 - username = ns + 'contributor/' + ns + 'username'
3.57
3.58 - # <text> inside <revision>
3.59 - text = ns + 'text'
3.60 +class ZCdbReader(cdb.Cdb):
3.61 + def __init__(self, indexpath, datapath=None):
3.62 + if not datapath:
3.63 + datapath = indexpath + 'data.bin'
3.64 + indexpath = indexpath + 'idx.cdb'
3.65
3.66 - # <timestamp> inside <revision>
3.67 - timestamp = ns + 'timestamp'
3.68 + cdb.Cdb.__init__(self, open(indexpath, 'rb'))
3.69 + self.datapath = datapath
3.70
3.71 - # <revision><text> inside <page>
3.72 - revision_text = ns + 'revision/' + ns + 'text'
3.73 + def __getitem__(self, key):
3.74 + key = key.encode("utf-8")
3.75 + data = cdb.Cdb.__getitem__(self, key) # may raise KeyError
3.76 + return self._readz(data)
3.77
3.78 - siteinfo = ns + "siteinfo"
3.79 + def _readz(self, data):
3.80 + pos, len = map(int, data.split())
3.81 +
3.82 + f=open(self.datapath, "rb")
3.83 + f.seek(pos)
3.84 + d=f.read(len)
3.85 + f.close()
3.86 + return zlib.decompress(d).decode('utf-8')
3.87
3.88 -class DumpParser(object):
3.89 - category_ns = set(['category', 'kategorie'])
3.90 - image_ns = set(['image', 'bild'])
3.91 - template_ns = set(['template', 'vorlage'])
3.92 - wikipedia_ns = set(['wikipedia'])
3.93 + def iterkeys(self):
3.94 + return (k.decode('utf-8') for k in cdb.Cdb.iterkeys(self))
3.95
3.96 - tags = Tags()
3.97 + def iteritems(self):
3.98 + return ((k.decode('utf-8'), self._readz(v))
3.99 + for k,v in cdb.Cdb.iteritems(self))
3.100
3.101 + def itervalues(self):
3.102 + return (self._readz(v) for v in cdb.Cdb.itervalues(self))
3.103
3.104 - def __init__(self, xmlfilename):
3.105 - self.xmlfilename = xmlfilename
3.106
3.107 - def _write(self, msg):
3.108 - sys.stdout.write(msg)
3.109 - sys.stdout.flush()
3.110 -
3.111 - def openInputStream(self):
3.112 - if self.xmlfilename.lower().endswith(".bz2"):
3.113 - f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r")
3.114 - elif self.xmlfilename.lower().endswith(".7z"):
3.115 - f = os.popen("7z -so x %s" % self.xmlfilename, "r")
3.116 +class BuildWiki():
3.117 + def __init__(self, dumpfile, outputdir, prefix='wiki'):
3.118 + if type(dumpfile) in (type(''), type(u'')):
3.119 + self.dumpParser = dumpparser.DumpParser(dumpfile)
3.120 else:
3.121 - f = open(self.xmlfilename, "r")
3.122 -
3.123 - return f
3.124 -
3.125 - def __call__(self):
3.126 - f = self.openInputStream()
3.127 -
3.128 - count = 0
3.129 - for event, elem in cElementTree.iterparse(f):
3.130 - if elem.tag != self.tags.page:
3.131 - continue
3.132 - self.handlePageElement(elem)
3.133 - elem.clear()
3.134 - count += 1
3.135 -
3.136 - if count % 5000 == 0:
3.137 - self._write(" %s\n" % count)
3.138 - elif count % 100 == 0:
3.139 - self._write(".")
3.140 -
3.141 -
3.142 - def handlePageElement(self, page):
3.143 - title = page.find(self.tags.title).text
3.144 - revisions = page.findall(self.tags.revision)
3.145 - if not revisions:
3.146 - return
3.147 - revision = revisions[-1]
3.148 -
3.149 - texttag = revision.find(self.tags.text)
3.150 - timestamptag = revision.find(self.tags.timestamp)
3.151 - revision.clear()
3.152 -
3.153 - if texttag is not None:
3.154 - text = texttag.text
3.155 - texttag.clear()
3.156 - else:
3.157 - text = None
3.158 -
3.159 - if timestamptag is not None:
3.160 - timestamp = timestamptag.text
3.161 - timestamptag.clear()
3.162 - else:
3.163 - timestamp = None
3.164 -
3.165 - if not text:
3.166 - return
3.167 -
3.168 - if isinstance(title, str):
3.169 - title = unicode(title)
3.170 - if isinstance(text, str):
3.171 - text = unicode(text)
3.172 -
3.173 -
3.174 - if ':' in title:
3.175 - ns, rest = title.split(':', 1)
3.176 - ns = ns.lower()
3.177 - if ns not in self.template_ns:
3.178 - return
3.179 - self.handleTemplate(rest, text, timestamp)
3.180 - else:
3.181 - self.handleArticle(title, text, timestamp)
3.182 -
3.183 - def handleArticle(self, title, text, timestamp):
3.184 - print "ART:", repr(title), len(text), timestamp
3.185 -
3.186 - def handleTemplate(self, title, text, timestamp):
3.187 - print "TEMPL:", repr(title), len(text), timestamp
3.188 -
3.189 -class BuildWiki(DumpParser):
3.190 - def __init__(self, xmlfilename, outputdir):
3.191 - DumpParser.__init__(self, xmlfilename)
3.192 + self.dumpParser = dumpfile
3.193 + self.output_path = os.path.join(outputdir, prefix)
3.194 self.outputdir = outputdir
3.195
3.196 def __call__(self):
3.197 if not os.path.exists(self.outputdir):
3.198 os.makedirs(self.outputdir)
3.199
3.200 - n = os.path.join(self.outputdir, wikiindex)
3.201 - out = open(os.path.join(self.outputdir, wikidata), "wb")
3.202 - self.out = out
3.203 - f = open(n+'.cdb', 'wb')
3.204 - c = cdb.CdbMake(f)
3.205 - self.cdb = c
3.206 + self.writer = ZCdbWriter(self.output_path)
3.207
3.208 - DumpParser.__call__(self)
3.209 - c.finish()
3.210 - f.close()
3.211 + count = 0
3.212 + for page in self.dumpParser:
3.213 + if page.namespace == dumpparser.NS_MAIN:
3.214 + self.handleArticle(page.title, page.text, page.timestamp)
3.215 + elif page.namespace == dumpparser.NS_TEMPLATE:
3.216 + self.handleTemplate(page.title, page.text, page.timestamp)
3.217 + else:
3.218 + self.handleOther(page.title, page.text, page.timestamp)
3.219
3.220 + count += 1
3.221 + if count % 5000 == 0:
3.222 + self._write(" %s\n" % count)
3.223 + elif count % 100 == 0:
3.224 + self._write(".")
3.225 +
3.226 + self.writer.finish()
3.227
3.228 - def _writeobj(self, key, val):
3.229 - key = key.encode("utf-8")
3.230 - val = zlib.compress(val)
3.231 - pos = self.out.tell()
3.232 - self.out.write(val)
3.233 - self.cdb.add(key, "%s %s" % (pos, len(val)))
3.234 + def _write(self, msg):
3.235 + sys.stdout.write(msg)
3.236 + sys.stdout.flush()
3.237
3.238 def handleArticle(self, title, text, timestamp):
3.239 - self._writeobj(u":"+title, text.encode("utf-8"))
3.240 + self.writer.add(u":"+title, text)
3.241
3.242 def handleTemplate(self, title, text, timestamp):
3.243 - self._writeobj(u"T:"+title, text.encode("utf-8"))
3.244 + self.writer.add(u"T:"+title, text)
3.245 +
3.246 + def handleOther(self, title, text, timestamp):
3.247 + self.writer.add(title, text)
3.248
3.249
3.250
3.251 class WikiDB(object):
3.252 redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE)
3.253
3.254 - def __init__(self, dir):
3.255 + def __init__(self, dir, prefix='wiki'):
3.256 self.dir = dir
3.257 - self.obj2pos_path = os.path.join(self.dir, wikidata)
3.258 - self.cdb = cdb.Cdb(open(os.path.join(self.dir, wikiindex+'.cdb'), 'rb'))
3.259 -
3.260 - def _readobj(self, key):
3.261 - key = key.encode("utf-8")
3.262 -
3.263 - try:
3.264 - data = self.cdb[key]
3.265 - except KeyError:
3.266 - return None
3.267 -
3.268 - pos, len = map(int, data.split())
3.269 -
3.270 - f=open(self.obj2pos_path, "rb")
3.271 - f.seek(pos)
3.272 - d=f.read(len)
3.273 - f.close()
3.274 - return zlib.decompress(d)
3.275 + self.reader = ZCdbReader(os.path.join(self.dir, prefix))
3.276
3.277 def getRawArticle(self, title, raw=None, revision=None):
3.278 title = normname(title)
3.279 - res = self._readobj(":"+title)
3.280 - if res is None:
3.281 - return None
3.282 + print repr(title)
3.283 + try:
3.284 + res = self.reader[":"+title]
3.285 + except KeyError:
3.286 + return None
3.287
3.288 res = unicode(res, 'utf-8')
3.289 mo = self.redirect_rex.search(res)
3.290 @@ -224,9 +147,10 @@
3.291 title = title.split(':', 1)[1]
3.292
3.293 title = normname(title)
3.294 - res = unicode(self._readobj(u"T:"+title) or "", 'utf-8')
3.295 - if not res:
3.296 - return res
3.297 + try:
3.298 + res = self.reader["T:"+title]
3.299 + except KeyError:
3.300 + return ''
3.301
3.302 mo = self.redirect_rex.search(res)
3.303 if mo:
3.304 @@ -237,7 +161,12 @@
3.305
3.306
3.307 def articles(self):
3.308 - for k, v in self.cdb:
3.309 - if k[0]==':':
3.310 - k = unicode(k[1:], "utf-8")
3.311 - yield k
3.312 + return (k[1:]
3.313 + for k in self.reader.iterkeys()
3.314 + if k[0] == ':')
3.315 +
3.316 + def article_texts(self):
3.317 + return ((k[1:], v)
3.318 + for k in self.reader.iteritems()
3.319 + if k[0] == ':')
3.320 +
4.1 --- a/mwlib/metabook.py Thu Jul 03 17:11:35 2008 +0200
4.2 +++ b/mwlib/metabook.py Thu Jul 03 17:11:44 2008 +0200
4.3 @@ -20,7 +20,7 @@
4.4 metabook['subtitle'] = subtitle
4.5 return metabook
4.6
4.7 -def make_source(name=None, url=None):
4.8 +def make_source(name=None, url=None, language=None):
4.9 source = {
4.10 'type': 'source',
4.11 'system': 'MediaWiki',
4.12 @@ -29,6 +29,8 @@
4.13 source['name'] = name
4.14 if url:
4.15 source['url'] = url
4.16 + if language:
4.17 + source['language'] = language
4.18 return source
4.19
4.20 def make_article(title=None, displaytitle=None, content_type='text/x-wiki'):
5.1 --- a/mwlib/mwapidb.py Thu Jul 03 17:11:35 2008 +0200
5.2 +++ b/mwlib/mwapidb.py Thu Jul 03 17:11:44 2008 +0200
5.3 @@ -416,6 +416,7 @@
5.4 self.template_blacklist = []
5.5 if template_blacklist is not None:
5.6 self.setTemplateBlacklist(template_blacklist)
5.7 + self.source = None
5.8
5.9 def setTemplateBlacklist(self, template_blacklist):
5.10 raw = self.getRawArticle(template_blacklist)
5.11 @@ -525,14 +526,18 @@
5.12 except KeyError:
5.13 return None
5.14
5.15 - def getMetaData(self):
5.16 + def getSource(self):
5.17 + if self.source is not None:
5.18 + return self.source
5.19 result = self.api_helper.query(meta='siteinfo')
5.20 try:
5.21 g = result['general']
5.22 - return metabook.make_source(
5.23 + self.source = metabook.make_source(
5.24 url=g['base'],
5.25 name='%s (%s)' % (g['sitename'], g['lang']),
5.26 + language=g['lang'],
5.27 )
5.28 + return self.source
5.29 except KeyError:
5.30 return None
5.31
6.1 --- a/mwlib/options.py Thu Jul 03 17:11:35 2008 +0200
6.2 +++ b/mwlib/options.py Thu Jul 03 17:11:44 2008 +0200
6.3 @@ -79,5 +79,6 @@
6.4 self.options.collectionpage,
6.5 ))
6.6 self.metabook = metabook.parse_collection_page(wikitext)
6.7 + env.metabook = self.metabook
6.8 return env
6.9
7.1 --- a/mwlib/parser.py Thu Jul 03 17:11:35 2008 +0200
7.2 +++ b/mwlib/parser.py Thu Jul 03 17:11:44 2008 +0200
7.3 @@ -9,6 +9,8 @@
7.4
7.5 from mwlib.scanner import tokenize, TagToken, EndTagToken
7.6 from mwlib.log import Log
7.7 +from mwlib.namespace import namespace_maps, interwiki_map
7.8 +from mwlib.lang import languages
7.9
7.10 log = Log("parser")
7.11
7.12 @@ -193,82 +195,165 @@
7.13
7.14 class Link(Node):
7.15 target = None
7.16 - specialPrefixes = set(["wikipedia", "wiktionary", "wikibooks", "wikisource",
7.17 - "wikiquote", "meta", "talk",
7.18 - "commons", "wikinews", "template", "wikitravel", "help", "vorlage"])
7.19 - from mwlib.lang import languages
7.20 + from mwlib.namespace import NS_MAIN, NS_CATEGORY, NS_IMAGE
7.21 +
7.22 colon = False
7.23
7.24 def hasContent(self):
7.25 if self.target:
7.26 return True
7.27 return False
7.28 +
7.29 + @classmethod
7.30 + def _buildSpecializeMap(cls, namespaces, interwikis, langs):
7.31 + """
7.32 + Returns a dict mapping namespace prefixes to a tuple of form
7.33 + (link_class, namespace_value).
7.34 + """
7.35 + res = {}
7.36 + for name, num in namespaces.iteritems():
7.37 + name = name.lower()
7.38 + if num == cls.NS_CATEGORY:
7.39 + res[name] = (CategoryLink, num)
7.40 + elif num == cls.NS_IMAGE:
7.41 + res[name] = (ImageLink, num)
7.42 + else:
7.43 + res[name] = (NamespaceLink, num)
7.44 +
7.45 + for name, target in interwikis.iteritems():
7.46 + res[name.lower()] = (InterwikiLink, target)
7.47 +
7.48 + for lang in langs:
7.49 + res[lang.lower()] = (LangLink, lang)
7.50 +
7.51 + return res
7.52
7.53 + @classmethod
7.54 + def _setSpecializeMap(cls, nsMap='default'):
7.55 + cls._specializeMap = cls._buildSpecializeMap(
7.56 + namespace_maps[nsMap], interwiki_map, languages)
7.57 +
7.58 def _specialize(self):
7.59 + """
7.60 + Handles different forms of link, e.g.:
7.61 + - [[Foo]]
7.62 + - [[Foo|Bar]]
7.63 + - [[Category:Foo]]
7.64 + - [[:Category:Foo]]
7.65 + """
7.66 +
7.67 if not self.children:
7.68 return
7.69
7.70 if type(self.children[0]) != Text:
7.71 return
7.72
7.73 - self.target = target = self.children[0].caption.strip()
7.74 + # Handle [[Foo|Bar]]
7.75 + full_target = self.children[0].caption.strip()
7.76 del self.children[0]
7.77 if self.children and self.children[0] == Control("|"):
7.78 del self.children[0]
7.79 +
7.80 + # Mark [[:Category:Foo]]. See below
7.81 + if full_target.startswith(':'):
7.82 + self.colon = True
7.83 + full_target = full_target[1:]
7.84 + self.full_target = full_target
7.85
7.86 - pic = self.target
7.87 - if pic.startswith(':'):
7.88 - self.colon = True
7.89 -
7.90 -
7.91 -
7.92 - # pic == "Bild:Wappen_von_Budenheim.png"
7.93 -
7.94 - pic = pic.strip(': ')
7.95 - if ':' not in pic:
7.96 - return
7.97 -
7.98 - linktype, pic = pic.split(':', 1)
7.99 - linktype = linktype.lower().strip(" :")
7.100 -
7.101 - if linktype in ("category", "kategorie"):
7.102 - self.__class__ = CategoryLink
7.103 - self.target = pic.strip()
7.104 + try:
7.105 + ns, title = full_target.split(':', 1)
7.106 + except ValueError:
7.107 + self.namespace = self.NS_MAIN
7.108 + self.target = full_target
7.109 + self.__class__ = ArticleLink
7.110 return
7.111
7.112 - if linktype in self.specialPrefixes:
7.113 - self.__class__ = SpecialLink
7.114 - self.target = pic.strip()
7.115 - self.ns = linktype
7.116 + (self.__class__, self.namespace) = (
7.117 + self._specializeMap.get(ns.lower(), (ArticleLink, self.NS_MAIN)))
7.118
7.119 + if len(ns) == 2:
7.120 + # Assume this is an unlisted language
7.121 + self.__class__ = LangLink
7.122 + self.namespace = ns.lower()
7.123 +
7.124 + if self.colon and self.namespace != self.NS_MAIN:
7.125 + # [[:Category:Foo]] should not be a category link
7.126 + self.__class__ = NamespaceLink
7.127 +
7.128 + if self.namespace == self.NS_MAIN:
7.129 + # e.g. [[Blah: Foo]] is an ordinary article with a colon
7.130 + self.target = full_target
7.131 + else:
7.132 + self.target = title
7.133 +
7.134 + if self.__class__ == ImageLink:
7.135 + # Handle images. First ensure they are syntactically sound.
7.136 +
7.137 + try:
7.138 + prefix, suffix = title.rsplit('.', 1)
7.139 + if suffix.lower() in ['jpg', 'jpeg', 'gif', 'png', 'svg']:
7.140 + self._readArgs() # calls Image._readArgs()
7.141 + return
7.142 + except ValueError:
7.143 + pass
7.144 + # We can't handle this as an image, so default:
7.145 + self.__class__ = NamespaceLink
7.146 +
7.147 +
7.148 + capitalizeTarget = False # Wiki-dependent setting, e.g. Wikipedia => True
7.149 +
7.150 + _SPACE_RE = re.compile('[_\s]+')
7.151 + def _normalizeTarget(self):
7.152 + """
7.153 + Normalizes the format of the target with regards to whitespace and
7.154 + capitalization (depending on capitalizeTarget setting).
7.155 + """
7.156 +
7.157 + if not self.target:
7.158 return
7.159
7.160 - if linktype in self.languages:
7.161 - self.__class__ = LangLink
7.162 - return
7.163 -
7.164 -
7.165 - if linktype not in ("bild", "image", "imagen"):
7.166 - # assume a LangLink
7.167 - log.info("Unknown linktype:", repr(linktype))
7.168 - if len(linktype)==2:
7.169 - self.__class__ = LangLink
7.170 - return
7.171 -
7.172 -
7.173 - # pic == "Wappen_von_Budenheim.png"
7.174 -
7.175 - try:
7.176 - prefix, suffix = pic.rsplit('.', 1)
7.177 - except ValueError:
7.178 - return
7.179 + # really we should have a urllib.unquote() first, but in practice this
7.180 + # format may be rare enough to ignore
7.181
7.182 - if suffix.lower() in ['jpg', 'jpeg', 'gif', 'png', 'svg']:
7.183 - self.__class__ = ImageLink
7.184 - self.target = pic.strip()
7.185 + # [[__init__]] -> [[init]]
7.186 + self.target = self._SPACE_RE.sub(' ', self.target).strip()
7.187 + if self.capitalizeTarget:
7.188 + self.target = self.target[:1].upper() + self.target[1:]
7.189
7.190
7.191 +# Link forms:
7.192
7.193 +class ArticleLink(Link):
7.194 + pass
7.195 +
7.196 +class SpecialLink(Link):
7.197 + pass
7.198 +
7.199 +class NamespaceLink(SpecialLink):
7.200 + pass
7.201 +
7.202 +class InterwikiLink(SpecialLink):
7.203 + pass
7.204 +
7.205 +# Non-links with same syntax:
7.206 +
7.207 +class LangLink(Link):
7.208 + pass
7.209 +
7.210 +class CategoryLink(Link):
7.211 + pass
7.212 +
7.213 +class ImageLink(Link):
7.214 + target = None
7.215 + width = None
7.216 + height = None
7.217 + align = ''
7.218 + thumb = False
7.219 +
7.220 + def isInline(self):
7.221 + return not bool(self.align or self.thumb)
7.222 +
7.223 + def _readArgs(self):
7.224 idx = 0
7.225 last = []
7.226
7.227 @@ -328,25 +413,8 @@
7.228
7.229 if not self.children:
7.230 self.children = last
7.231 -
7.232 -class ImageLink(Link):
7.233 - target = None
7.234 - width = None
7.235 - height = None
7.236 - align = ''
7.237 - thumb = False
7.238 -
7.239 - def isInline(self):
7.240 - return not bool(self.align or self.thumb)
7.241 -
7.242 -class LangLink(Link):
7.243 - pass
7.244
7.245 -class CategoryLink(Link):
7.246 - pass
7.247 -
7.248 -class SpecialLink(Link):
7.249 - pass
7.250 +Link._setSpecializeMap('default') # initialise the Link class
7.251
7.252
7.253 class Text(Node):
7.254 @@ -365,10 +433,10 @@
7.255 class Control(Text):
7.256 pass
7.257
7.258 -def _parseAtomFromString(s):
7.259 +def _parseAtomFromString(s, lang=None):
7.260 from mwlib import scanner
7.261 tokens = scanner.tokenize(s)
7.262 - p=Parser(tokens)
7.263 + p=Parser(tokens, lang=lang)
7.264 try:
7.265 return p.parseAtom()
7.266 except Exception, err:
7.267 @@ -377,10 +445,10 @@
7.268
7.269
7.270
7.271 -def parse_fields_in_imagemap(imap):
7.272 +def parse_fields_in_imagemap(imap, lang=None):
7.273
7.274 if imap.image:
7.275 - imap.imagelink = _parseAtomFromString(u'[['+imap.image+']]')
7.276 + imap.imagelink = _parseAtomFromString(u'[['+imap.image+']]', lang=lang)
7.277 if not isinstance(imap.imagelink, ImageLink):
7.278 imap.imagelink = None
7.279
7.280 @@ -397,13 +465,22 @@
7.281 _ALPHA_RE = re.compile(r'[^\W\d_]+', re.UNICODE) # Matches alpha strings
7.282
7.283 class Parser(object):
7.284 - def __init__(self, tokens, name=''):
7.285 + def __init__(self, tokens, name='', lang=None):
7.286 self.tokens = tokens
7.287 + self.lang = lang
7.288 self.pos = 0
7.289 self.name = name
7.290 self.lastpos = 0
7.291 self.count = 0
7.292 -
7.293 +
7.294 + if lang:
7.295 + nsMap = '%s+en_mw' % lang
7.296 + if nsMap not in namespace_maps:
7.297 + nsMap = 'default'
7.298 + else:
7.299 + nsMap = 'default'
7.300 + Link._setSpecializeMap(nsMap)
7.301 +
7.302 from mwlib import tagext
7.303 self.tagextensions = tagext.default_registry
7.304
7.305 @@ -548,7 +625,7 @@
7.306
7.307 if not obj.children and obj.target:
7.308 # [[a]] -> [[a|a]]
7.309 - obj.append(Text(obj.target))
7.310 + obj.append(Text(obj.full_target))
7.311
7.312 if isinstance(obj, ImageLink):
7.313 return obj
7.314 @@ -559,6 +636,8 @@
7.315 # [[a|a]]b -> [[a|ab]]
7.316 obj.append(Text(m.group(0)), True)
7.317 self.tokens[self.pos] = ('TEXT', self.token[1][m.end():])
7.318 +
7.319 + obj._normalizeTarget()
7.320
7.321 return obj
7.322
7.323 @@ -668,7 +747,7 @@
7.324 continue
7.325
7.326 # either image link or text inside
7.327 - n=_parseAtomFromString(u'[['+x+']]')
7.328 + n=_parseAtomFromString(u'[['+x+']]', lang=self.lang)
7.329
7.330 if isinstance(n, ImageLink):
7.331 children.append(n)
7.332 @@ -684,7 +763,7 @@
7.333 txt = "".join(x.caption for x in node.find(Text))
7.334 from mwlib import imgmap
7.335 node.imagemap = imgmap.ImageMapFromString(txt)
7.336 - parse_fields_in_imagemap(node.imagemap)
7.337 + parse_fields_in_imagemap(node.imagemap, lang=self.lang)
7.338
7.339 #print node.imagemap
7.340 return node
8.1 --- a/mwlib/uparser.py Thu Jul 03 17:11:35 2008 +0200
8.2 +++ b/mwlib/uparser.py Thu Jul 03 17:11:44 2008 +0200
8.3 @@ -76,7 +76,7 @@
8.4
8.5 postprocessors = [removeBoilerplate, simplify, fixlitags]
8.6
8.7 -def parseString(title=None, raw=None, wikidb=None, revision=None):
8.8 +def parseString(title=None, raw=None, wikidb=None, revision=None, lang=None):
8.9 """parse article with title from raw mediawiki text"""
8.10 assert title is not None
8.11
8.12 @@ -86,12 +86,16 @@
8.13 if wikidb:
8.14 te = expander.Expander(raw, pagename=title, wikidb=wikidb)
8.15 input = te.expandTemplates()
8.16 + if lang is None and hasattr(wikidb, 'getSource'):
8.17 + src = wikidb.getSource()
8.18 + if src:
8.19 + lang = src.get('language')
8.20 else:
8.21 input = raw
8.22 -
8.23 +
8.24 tokens = scanner.tokenize(input, title)
8.25
8.26 - a = parser.Parser(tokens, title).parse()
8.27 + a = parser.Parser(tokens, title, lang=lang).parse()
8.28 a.caption = title
8.29 for x in postprocessors:
8.30 x(a)
9.1 --- a/mwlib/wiki.py Thu Jul 03 17:11:35 2008 +0200
9.2 +++ b/mwlib/wiki.py Thu Jul 03 17:11:44 2008 +0200
9.3 @@ -135,8 +135,8 @@
9.4 def get_source(self):
9.5 if 'source' in self.metabook:
9.6 return self.metabook['source']
9.7 - if hasattr(self.wiki, 'getMetaData'):
9.8 - return self.wiki.getMetaData()
9.9 + if hasattr(self.wiki, 'getSource'):
9.10 + return self.wiki.getSource()
9.11 return metabook.make_source(
9.12 name=self.configparser.get('wiki', 'name'),
9.13 url=self.configparser.get('wiki', 'url'),
10.1 --- a/mwlib/zipwiki.py Thu Jul 03 17:11:35 2008 +0200
10.2 +++ b/mwlib/zipwiki.py Thu Jul 03 17:11:44 2008 +0200
10.3 @@ -35,6 +35,9 @@
10.4 except KeyError:
10.5 pass
10.6 return None
10.7 +
10.8 + def getSource(self):
10.9 + return self.metabook.get('source')
10.10
10.11 def getRawArticle(self, title, revision=None):
10.12 article = self._getArticle(title, revision=revision)
12.1 --- a/tests/test_parser.py Thu Jul 03 17:11:35 2008 +0200
12.2 +++ b/tests/test_parser.py Thu Jul 03 17:11:44 2008 +0200
12.3 @@ -610,3 +610,59 @@
12.4 assert u'<nosuchtag>' in txt, 'opening tag missing in asText()'
12.5 assert u'</nosuchtag>' in txt, 'closing tag missing in asText()'
12.6
12.7 +# Test varieties of link
12.8 +
12.9 +def test_plain_link():
12.10 + r=parse("[[bla]]").find(parser.ArticleLink)[0]
12.11 + assert r.target=='bla'
12.12 + assert r.children[0].caption == 'bla'
12.13 +
12.14 +def test_piped_link():
12.15 + r=parse("[[bla|blubb]]").find(parser.ArticleLink)[0]
12.16 + assert r.target=='bla'
12.17 + assert r.children[0].caption == 'blubb'
12.18 +
12.19 +def test_category_link():
12.20 + r=parse("[[category:bla]]").find(parser.CategoryLink)[0]
12.21 + assert r.target=='bla'
12.22 + assert r.namespace == 14
12.23 +
12.24 +def test_category_colon_link():
12.25 + r=parse("[[:category:bla]]").find(parser.SpecialLink)[0]
12.26 + assert r.target=='bla'
12.27 + assert r.namespace == 14
12.28 + assert not isinstance(r, parser.CategoryLink)
12.29 +
12.30 +def test_image_colon_link():
12.31 + r=parse("[[:image:bla.jpg]]").find(parser.SpecialLink)[0]
12.32 + assert r.target=='bla.jpg'
12.33 + assert r.namespace == 6
12.34 + assert not isinstance(r, parser.ImageLink)
12.35 +
12.36 +def test_interwiki_link():
12.37 + r=parse("[[wict:bla]]").find(parser.SpecialLink)[0]
12.38 + assert r.target=='bla'
12.39 + assert r.namespace == 'wiktionary'
12.40 +
12.41 +def test_language_link():
12.42 + r=parse("[[es:bla]]").find(parser.LangLink)[0]
12.43 + assert r.target=='bla'
12.44 + assert r.namespace == 'es'
12.45 +
12.46 +def test_long_language_link():
12.47 + r=parse("[[csb:bla]]").find(parser.LangLink)[0]
12.48 + assert r.target=='bla'
12.49 + assert r.namespace == 'csb'
12.50 +
12.51 +def test_normalize():
12.52 + r=parse("[[MediaWiki:__bla_ _]]").find(parser.LangLink)[0]
12.53 + assert r.target=='bla'
12.54 + assert r.namespace == 8
12.55 +
12.56 +def test_normalize_with_caps():
12.57 + parser.Link.capitalizeTarget = True
12.58 + r=parse("[[MediaWiki:__bla_ _ ]]").find(parser.LangLink)[0]
12.59 + parser.Link.capitalizeTarget = False
12.60 + assert r.target=='Bla'
12.61 + assert r.namespace == 8
12.62 + assert r.children[0].caption == 'MediaWiki:__bla_ _'
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
13.2 +++ b/mwlib/dumpparser.py Thu Jul 03 17:11:44 2008 +0200
13.3 @@ -0,0 +1,210 @@
13.4 +import os
13.5 +import re
13.6 +
13.7 +try:
13.8 + from xml.etree import cElementTree
13.9 +except ImportError:
13.10 + import cElementTree
13.11 +
13.12 +ns = '{http://www.mediawiki.org/xml/export-0.3/}'
13.13 +class Tags:
13.14 +
13.15 + # <namespaces><namespace> inside <siteinfo>
13.16 + namespace = ns + 'namespaces/' + ns + 'namespace'
13.17 +
13.18 + page = ns + 'page'
13.19 +
13.20 + # <title> inside <page>
13.21 + title = ns + 'title'
13.22 +
13.23 + # <revision> inside <page>
13.24 + revision = ns + 'revision'
13.25 +
13.26 + # <id> inside <revision>
13.27 + revid = ns + 'id'
13.28 +
13.29 + # <contributor><username> inside <revision>
13.30 + username = ns + 'contributor/' + ns + 'username'
13.31 +
13.32 + # <text> inside <revision>
13.33 + text = ns + 'text'
13.34 +
13.35 + # <timestamp> inside <revision>
13.36 + timestamp = ns + 'timestamp'
13.37 +
13.38 + # <revision><text> inside <page>
13.39 + revision_text = ns + 'revision/' + ns + 'text'
13.40 +
13.41 + siteinfo = ns + "siteinfo"
13.42 +
13.43 +NS_MEDIA = -2
13.44 +NS_SPECIAL = -1
13.45 +NS_MAIN = 0
13.46 +NS_TALK = 1
13.47 +NS_USER = 2
13.48 +NS_USER_TALK = 3
13.49 +NS_PROJECT = 4
13.50 +NS_PROJECT_TALK = 5
13.51 +NS_IMAGE = 6
13.52 +NS_IMAGE_TALK = 7
13.53 +NS_MEDIAWIKI = 8
13.54 +NS_MEDIAWIKI_TALK = 9
13.55 +NS_TEMPLATE = 10
13.56 +NS_TEMPLATE_TALK = 11
13.57 +NS_HELP = 12
13.58 +NS_HELP_TALK = 13
13.59 +NS_CATEGORY = 14
13.60 +NS_CATEGORY_TALK = 15
13.61 +
13.62 +class Page(object):
13.63 + __slots__ = [
13.64 + 'title', 'pageid', 'namespace_text',
13.65 + 'namespace',
13.66 + 'revid', 'timestamp',
13.67 + 'username', 'userid',
13.68 + 'minor', 'comment', 'text'
13.69 + ]
13.70 +
13.71 + def __init__(self):
13.72 + self.namespace_text = ''
13.73 + self.namespace = NS_MAIN
13.74 +
13.75 + redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE)
13.76 +
13.77 + @property
13.78 + def redirect(self):
13.79 + mo = self.redirect_rex.search(self.text)
13.80 + if mo:
13.81 + return mo.group('redirect').split("|", 1)[0]
13.82 + return None
13.83 +
13.84 + def __repr__(self):
13.85 + text = repr(self.text[:50])
13.86 + redir = self.redirect
13.87 + if redir:
13.88 + text = "Redirect to %s" % repr(redir)
13.89 + return 'Page(%s (@%s): %s)' % (repr(self.title), self.timestamp, text)
13.90 +
13.91 +
13.92 +class DumpParser(object):
13.93 + namespaces = {
13.94 + 'template': NS_TEMPLATE,
13.95 + 'vorlage': NS_TEMPLATE,
13.96 + 'category': NS_CATEGORY,
13.97 + 'kategorie': NS_CATEGORY,
13.98 + 'image': NS_IMAGE,
13.99 + 'bild': NS_IMAGE,
13.100 + 'wikipedia': NS_PROJECT,
13.101 + }
13.102 +
13.103 + default_namespaces = [NS_MAIN, NS_TEMPLATE]
13.104 +
13.105 + tags = Tags()
13.106 +
13.107 + def __init__(self, xmlfilename,
13.108 + namespace_filter=default_namespaces,
13.109 + ignore_redirects=False):
13.110 + self.xmlfilename = xmlfilename
13.111 + self.namespace_filter = namespace_filter
13.112 + self.ignore_redirects = ignore_redirects
13.113 +
13.114 + def openInputStream(self):
13.115 + if self.xmlfilename.lower().endswith(".bz2"):
13.116 + f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r")
13.117 + elif self.xmlfilename.lower().endswith(".7z"):
13.118 + f = os.popen("7z -so x %s" % self.xmlfilename, "r")
13.119 + else:
13.120 + f = open(self.xmlfilename, "r")
13.121 +
13.122 + return f
13.123 +
13.124 + @staticmethod
13.125 + def getTag(elem):
13.126 + # rough is good enough
13.127 + return elem.tag[elem.tag.rindex('}')+1:]
13.128 +
13.129 + def handleSiteinfo(self, siteinfo):
13.130 + for nsElem in siteinfo.findall(self.tags.namespace):
13.131 + try:
13.132 + self.namespaces[nsElem.text.lower()] = int(nsElem.get('key'))
13.133 + except AttributeError:
13.134 + # text is probably None
13.135 + pass
13.136 +
13.137 + def __iter__(self):
13.138 + f = self.openInputStream()
13.139 +
13.140 + elemIter = (el for evt, el in cElementTree.iterparse(f))
13.141 + for elem in elemIter:
13.142 + if self.getTag(elem) == 'page':
13.143 + page = self.handlePageElement(elem)
13.144 + if page:
13.145 + yield page
13.146 + elem.clear()
13.147 + elif self.getTag(elem) == 'siteinfo':
13.148 + self.handleSiteinfo(elem)
13.149 + elem.clear()
13.150 +
13.151 + f.close()
13.152 +
13.153 + def handlePageElement(self, pageElem):
13.154 + res = Page()
13.155 + lastRevision = None
13.156 + for el in pageElem:
13.157 + tag = self.getTag(el)
13.158 + if tag == 'title':