Automated merge with http://code.pediapress.com/hg/mwlib
author heiko@brainbot.com
Thu Jul 03 17:11:44 2008 +0200 (4 months ago)
changeset 1127:f18354b68a9a
parent 1126:a0e399347e61
parent 1125:60428f1271d7
child 1128:a5f5d8cf71a4
Automated merge with http://code.pediapress.com/hg/mwlib
       1 --- a/MANIFEST.in	Thu Jul 03 17:11:35 2008 +0200
       2 +++ b/MANIFEST.in	Thu Jul 03 17:11:44 2008 +0200
       3 @@ -63,6 +63,7 @@
       4  include mwlib/serve.py
       5  include mwlib/snippets.py
       6  include mwlib/snippets.txt
       7 +include mwlib/tagext.py
       8  include mwlib/texmap.py
       9  include mwlib/timeline.py
      10  include mwlib/uparser.py
      11 @@ -97,6 +98,7 @@
      12  include tests/test_sanitychecker.py
      13  include tests/test_scanner.py
      14  include tests/test_table.py
      15 +include tests/test_tagext.py
      16  include tests/test_timeline.py
      17  include tests/test_utils.py
      18  include tests/test_xhtmlwriter.py
     1.1 --- a/docs/metabook.txt	Thu Jul 03 17:11:35 2008 +0200
     1.2 +++ b/docs/metabook.txt	Thu Jul 03 17:11:44 2008 +0200
     1.3 @@ -73,6 +73,10 @@
     1.4  name (string):
     1.5  
     1.6      Unique name of source, e.g. "Wikipedia (en)"
     1.7 +
     1.8 +language (string)
     1.9 +
    1.10 +    2-character ISO code of language, e.g. "en"
    1.11  
    1.12  
    1.13  License
     2.1 --- a/mwlib/cdb.py	Thu Jul 03 17:11:35 2008 +0200
     2.2 +++ b/mwlib/cdb.py	Thu Jul 03 17:11:44 2008 +0200
     2.3 @@ -48,7 +48,7 @@
     2.4      def close(self):
     2.5          self.map.close()
     2.6  
     2.7 -    def __iter__(self, fn=None):
     2.8 +    def __iter__(self):
     2.9          len = 2048
    2.10          while len < self.eod:
    2.11              klen, vlen = struct.unpack("<LL", self.map[len:len+8])
    2.12 @@ -57,37 +57,25 @@
    2.13              len += klen
    2.14              val = self.map[len:len+vlen]
    2.15              len += vlen
    2.16 -            if fn:
    2.17 -                yield fn(key, val)
    2.18 -            else:
    2.19 -                yield (key, val)
    2.20 +            yield (key, val)
    2.21  
    2.22      def iteritems(self):
    2.23          return self.__iter__()
    2.24  
    2.25      def iterkeys(self):
    2.26 -        return self.__iter__(lambda k,v: k)
    2.27 +        return (k for k, v in self)
    2.28  
    2.29      def itervalues(self):
    2.30 -        return self.__iter__(lambda k,v: v)
    2.31 +        return (v for k, v in self)
    2.32  
    2.33      def items(self):
    2.34 -        ret = []
    2.35 -        for i in self.iteritems():
    2.36 -            ret.append(i)
    2.37 -        return ret
    2.38 +        return list(self.iteritems())
    2.39  
    2.40      def keys(self):
    2.41 -        ret = []
    2.42 -        for i in self.iterkeys():
    2.43 -            ret.append(i)
    2.44 -        return ret
    2.45 +        return list(self.iterkeys())
    2.46  
    2.47      def values(self):
    2.48 -        ret = []
    2.49 -        for i in self.itervalues():
    2.50 -            ret.append(i)
    2.51 -        return ret
    2.52 +        return list(self.itervalues())
    2.53  
    2.54      def findstart(self):
    2.55          self.loop = 0
     3.1 --- a/mwlib/cdbwiki.py	Thu Jul 03 17:11:35 2008 +0200
     3.2 +++ b/mwlib/cdbwiki.py	Thu Jul 03 17:11:44 2008 +0200
     3.3 @@ -8,206 +8,129 @@
     3.4  import zlib
     3.5  import re
     3.6  
     3.7 -from mwlib import cdb
     3.8 -
     3.9 -try:
    3.10 -    from xml.etree import cElementTree
    3.11 -except ImportError:
    3.12 -    import cElementTree
    3.13 -
    3.14 -ns = '{http://www.mediawiki.org/xml/export-0.3/}'
    3.15 -
    3.16 -wikiindex = "wikiidx"
    3.17 -wikidata = "wikidata.bin"
    3.18 -
    3.19 -
    3.20 +from mwlib import cdb, dumpparser
    3.21  
    3.22  def normname(name):
    3.23      name = name.strip().replace("_", " ")
    3.24      name = name[:1].upper()+name[1:]
    3.25      return name
    3.26  
    3.27 -class Tags:
    3.28 -    page = ns + 'page'
    3.29 +class ZCdbWriter(cdb.CdbMake):
    3.30 +    def __init__(self, indexpath, datapath=None):
    3.31 +        if not datapath:
    3.32 +            datapath = indexpath + 'data.bin'
    3.33 +            indexpath = indexpath + 'idx.cdb'
    3.34  
    3.35 -    # <title> inside <page>
    3.36 -    title = ns + 'title'
    3.37 +        cdb.CdbMake.__init__(self, open(indexpath, 'wb'))
    3.38 +        self.data = open(datapath, 'wb')
    3.39  
    3.40 -    # <revision> inside <page>
    3.41 -    revision = ns + 'revision'
    3.42 +    def add(self, key, val):
    3.43 +        key = key.encode("utf-8")
    3.44 +        val = zlib.compress(val.encode('utf-8')) # NOTE: encode wasn't in original
    3.45 +        pos = self.data.tell()
    3.46 +        self.data.write(val)
    3.47 +        cdb.CdbMake.add(self, key, "%s %s" % (pos, len(val)))
    3.48  
    3.49 -    # <id> inside <revision>
    3.50 -    revid = ns + 'id'
    3.51 +    def finish(self):
    3.52 +        cdb.CdbMake.finish(self)
    3.53 +        self.data.close()
    3.54  
    3.55 -    # <contributor><username> inside <revision>
    3.56 -    username = ns + 'contributor/' + ns + 'username'
    3.57  
    3.58 -    # <text> inside <revision>
    3.59 -    text = ns + 'text'
    3.60 +class ZCdbReader(cdb.Cdb):
    3.61 +    def __init__(self, indexpath, datapath=None):
    3.62 +        if not datapath:
    3.63 +            datapath = indexpath + 'data.bin'
    3.64 +            indexpath = indexpath + 'idx.cdb'
    3.65  
    3.66 -    # <timestamp> inside <revision>
    3.67 -    timestamp = ns + 'timestamp'
    3.68 +        cdb.Cdb.__init__(self, open(indexpath, 'rb'))
    3.69 +        self.datapath = datapath
    3.70  
    3.71 -    # <revision><text> inside <page>
    3.72 -    revision_text = ns + 'revision/' + ns + 'text'
    3.73 +    def __getitem__(self, key):
    3.74 +        key = key.encode("utf-8")
    3.75 +        data = cdb.Cdb.__getitem__(self, key) # may raise KeyError 
    3.76 +        return self._readz(data)
    3.77  
    3.78 -    siteinfo = ns + "siteinfo"
    3.79 +    def _readz(self, data):
    3.80 +        pos, len = map(int, data.split())
    3.81 +        
    3.82 +        f=open(self.datapath, "rb")
    3.83 +        f.seek(pos)
    3.84 +        d=f.read(len)
    3.85 +        f.close()
    3.86 +        return zlib.decompress(d).decode('utf-8')
    3.87  
    3.88 -class DumpParser(object):
    3.89 -    category_ns = set(['category', 'kategorie'])
    3.90 -    image_ns = set(['image', 'bild'])
    3.91 -    template_ns = set(['template', 'vorlage'])
    3.92 -    wikipedia_ns = set(['wikipedia'])
    3.93 +    def iterkeys(self):
    3.94 +        return (k.decode('utf-8') for k in cdb.Cdb.iterkeys(self))
    3.95  
    3.96 -    tags = Tags()
    3.97 +    def iteritems(self):
    3.98 +        return ((k.decode('utf-8'), self._readz(v))
    3.99 +            for k,v in cdb.Cdb.iteritems(self))
   3.100  
   3.101 +    def itervalues(self):
   3.102 +        return (self._readz(v) for v in cdb.Cdb.itervalues(self))
   3.103  
   3.104 -    def __init__(self, xmlfilename):
   3.105 -        self.xmlfilename = xmlfilename
   3.106  
   3.107 -    def _write(self, msg):
   3.108 -        sys.stdout.write(msg)
   3.109 -        sys.stdout.flush()
   3.110 -
   3.111 -    def openInputStream(self):
   3.112 -        if self.xmlfilename.lower().endswith(".bz2"):
   3.113 -            f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r")
   3.114 -        elif self.xmlfilename.lower().endswith(".7z"):
   3.115 -            f = os.popen("7z -so x %s" % self.xmlfilename, "r")
   3.116 +class BuildWiki():
   3.117 +    def __init__(self, dumpfile, outputdir, prefix='wiki'):
   3.118 +        if type(dumpfile) in (type(''), type(u'')):
   3.119 +            self.dumpParser = dumpparser.DumpParser(dumpfile)
   3.120          else:
   3.121 -            f = open(self.xmlfilename, "r")        
   3.122 -
   3.123 -        return f
   3.124 -
   3.125 -    def __call__(self):
   3.126 -        f = self.openInputStream()    
   3.127 -        
   3.128 -        count = 0
   3.129 -        for event, elem in cElementTree.iterparse(f):
   3.130 -            if elem.tag != self.tags.page:
   3.131 -                continue
   3.132 -            self.handlePageElement(elem)
   3.133 -            elem.clear()
   3.134 -            count += 1
   3.135 -            
   3.136 -            if count % 5000 == 0:
   3.137 -                self._write(" %s\n" % count)            
   3.138 -            elif count % 100 == 0:
   3.139 -                self._write(".")
   3.140 -
   3.141 -    
   3.142 -    def handlePageElement(self, page):
   3.143 -        title = page.find(self.tags.title).text
   3.144 -        revisions = page.findall(self.tags.revision)
   3.145 -        if not revisions:
   3.146 -            return
   3.147 -        revision = revisions[-1]
   3.148 -        
   3.149 -        texttag = revision.find(self.tags.text)
   3.150 -        timestamptag = revision.find(self.tags.timestamp)
   3.151 -        revision.clear()
   3.152 -        
   3.153 -        if texttag is not None:
   3.154 -            text = texttag.text
   3.155 -            texttag.clear()
   3.156 -        else:
   3.157 -            text = None
   3.158 -            
   3.159 -        if timestamptag is not None:
   3.160 -            timestamp = timestamptag.text
   3.161 -            timestamptag.clear()
   3.162 -        else:
   3.163 -            timestamp = None
   3.164 -        
   3.165 -        if not text:
   3.166 -            return
   3.167 -
   3.168 -        if isinstance(title, str):
   3.169 -            title = unicode(title)
   3.170 -        if isinstance(text, str):
   3.171 -            text = unicode(text)
   3.172 -
   3.173 -            
   3.174 -        if ':' in title:
   3.175 -            ns, rest = title.split(':', 1)
   3.176 -            ns = ns.lower()
   3.177 -            if ns not in self.template_ns:
   3.178 -                return
   3.179 -            self.handleTemplate(rest, text, timestamp)
   3.180 -        else:
   3.181 -            self.handleArticle(title, text, timestamp)
   3.182 -
   3.183 -    def handleArticle(self, title, text, timestamp):
   3.184 -        print "ART:", repr(title), len(text), timestamp
   3.185 -
   3.186 -    def handleTemplate(self, title, text, timestamp):
   3.187 -        print "TEMPL:", repr(title), len(text), timestamp
   3.188 -
   3.189 -class BuildWiki(DumpParser):
   3.190 -    def __init__(self, xmlfilename, outputdir):
   3.191 -        DumpParser.__init__(self, xmlfilename)
   3.192 +            self.dumpParser = dumpfile
   3.193 +        self.output_path = os.path.join(outputdir, prefix)
   3.194          self.outputdir = outputdir
   3.195          
   3.196      def __call__(self):
   3.197          if not os.path.exists(self.outputdir):
   3.198              os.makedirs(self.outputdir)
   3.199          
   3.200 -        n = os.path.join(self.outputdir, wikiindex)
   3.201 -        out = open(os.path.join(self.outputdir, wikidata), "wb")
   3.202 -        self.out = out
   3.203 -        f = open(n+'.cdb', 'wb')
   3.204 -        c = cdb.CdbMake(f)
   3.205 -        self.cdb = c
   3.206 +        self.writer = ZCdbWriter(self.output_path)
   3.207  
   3.208 -        DumpParser.__call__(self)
   3.209 -        c.finish()
   3.210 -        f.close()
   3.211 +        count = 0
   3.212 +        for page in self.dumpParser:
   3.213 +            if page.namespace == dumpparser.NS_MAIN:
   3.214 +                self.handleArticle(page.title, page.text, page.timestamp)
   3.215 +            elif page.namespace == dumpparser.NS_TEMPLATE:
   3.216 +                self.handleTemplate(page.title, page.text, page.timestamp)
   3.217 +            else:
   3.218 +                self.handleOther(page.title, page.text, page.timestamp)
   3.219  
   3.220 +            count += 1
   3.221 +            if count % 5000 == 0:
   3.222 +                self._write(" %s\n" % count)
   3.223 +            elif count % 100 == 0:
   3.224 +                self._write(".")
   3.225 +            
   3.226 +        self.writer.finish()
   3.227  
   3.228 -    def _writeobj(self, key, val):
   3.229 -        key = key.encode("utf-8")
   3.230 -        val = zlib.compress(val)
   3.231 -        pos = self.out.tell()
   3.232 -        self.out.write(val)
   3.233 -        self.cdb.add(key, "%s %s" % (pos, len(val)))
   3.234 +    def _write(self, msg):
   3.235 +        sys.stdout.write(msg)
   3.236 +        sys.stdout.flush()
   3.237  
   3.238      def handleArticle(self, title, text, timestamp):
   3.239 -        self._writeobj(u":"+title, text.encode("utf-8"))
   3.240 +        self.writer.add(u":"+title, text)
   3.241  
   3.242      def handleTemplate(self, title, text, timestamp):
   3.243 -        self._writeobj(u"T:"+title, text.encode("utf-8"))
   3.244 +        self.writer.add(u"T:"+title, text)
   3.245 +
   3.246 +    def handleOther(self, title, text, timestamp):
   3.247 +        self.writer.add(title, text)
   3.248      
   3.249  
   3.250  
   3.251  class WikiDB(object):
   3.252      redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE)
   3.253  
   3.254 -    def __init__(self, dir):
   3.255 +    def __init__(self, dir, prefix='wiki'):
   3.256          self.dir = dir
   3.257 -        self.obj2pos_path = os.path.join(self.dir, wikidata)
   3.258 -        self.cdb = cdb.Cdb(open(os.path.join(self.dir, wikiindex+'.cdb'), 'rb'))
   3.259 -
   3.260 -    def _readobj(self, key):
   3.261 -        key = key.encode("utf-8")
   3.262 -
   3.263 -        try:
   3.264 -            data = self.cdb[key]  
   3.265 -        except KeyError:
   3.266 -            return None
   3.267 -
   3.268 -        pos, len = map(int, data.split())
   3.269 -        
   3.270 -        f=open(self.obj2pos_path, "rb")
   3.271 -        f.seek(pos)
   3.272 -        d=f.read(len)
   3.273 -        f.close()
   3.274 -        return zlib.decompress(d)
   3.275 +        self.reader = ZCdbReader(os.path.join(self.dir, prefix))
   3.276  
   3.277      def getRawArticle(self, title, raw=None, revision=None):
   3.278          title = normname(title)
   3.279 -        res = self._readobj(":"+title)
   3.280 -        if res is None:
   3.281 -            return  None
   3.282 +        print repr(title)
   3.283 +        try:
   3.284 +            res = self.reader[":"+title]
   3.285 +        except KeyError:
   3.286 +            return None
   3.287  
   3.288          res = unicode(res, 'utf-8')
   3.289          mo = self.redirect_rex.search(res)
   3.290 @@ -224,9 +147,10 @@
   3.291              title = title.split(':', 1)[1]
   3.292  
   3.293          title = normname(title)
   3.294 -        res = unicode(self._readobj(u"T:"+title) or "", 'utf-8')
   3.295 -        if not res:
   3.296 -            return res
   3.297 +        try:
   3.298 +            res = self.reader["T:"+title]
   3.299 +        except KeyError:
   3.300 +            return ''
   3.301  
   3.302          mo = self.redirect_rex.search(res)
   3.303          if mo:
   3.304 @@ -237,7 +161,12 @@
   3.305  
   3.306  
   3.307      def articles(self):
   3.308 -        for k, v in self.cdb:
   3.309 -            if k[0]==':':
   3.310 -                k = unicode(k[1:], "utf-8")
   3.311 -                yield k
   3.312 +        return (k[1:]
   3.313 +                for k in self.reader.iterkeys()
   3.314 +                if k[0] == ':')
   3.315 +
   3.316 +    def article_texts(self):
   3.317 +        return ((k[1:], v)
   3.318 +                for k in self.reader.iteritems()
   3.319 +                if k[0] == ':')
   3.320 +        
     4.1 --- a/mwlib/metabook.py	Thu Jul 03 17:11:35 2008 +0200
     4.2 +++ b/mwlib/metabook.py	Thu Jul 03 17:11:44 2008 +0200
     4.3 @@ -20,7 +20,7 @@
     4.4          metabook['subtitle'] = subtitle
     4.5      return metabook
     4.6  
     4.7 -def make_source(name=None, url=None):
     4.8 +def make_source(name=None, url=None, language=None):
     4.9      source = {
    4.10          'type': 'source',
    4.11          'system': 'MediaWiki',
    4.12 @@ -29,6 +29,8 @@
    4.13          source['name'] = name
    4.14      if url:
    4.15          source['url'] = url
    4.16 +    if language:
    4.17 +        source['language'] = language
    4.18      return source
    4.19  
    4.20  def make_article(title=None, displaytitle=None, content_type='text/x-wiki'):
     5.1 --- a/mwlib/mwapidb.py	Thu Jul 03 17:11:35 2008 +0200
     5.2 +++ b/mwlib/mwapidb.py	Thu Jul 03 17:11:44 2008 +0200
     5.3 @@ -416,6 +416,7 @@
     5.4          self.template_blacklist = []
     5.5          if template_blacklist is not None:
     5.6              self.setTemplateBlacklist(template_blacklist)
     5.7 +        self.source = None
     5.8      
     5.9      def setTemplateBlacklist(self, template_blacklist):
    5.10          raw = self.getRawArticle(template_blacklist)
    5.11 @@ -525,14 +526,18 @@
    5.12          except KeyError:
    5.13              return None
    5.14      
    5.15 -    def getMetaData(self):
    5.16 +    def getSource(self):
    5.17 +        if self.source is not None:
    5.18 +            return self.source
    5.19          result = self.api_helper.query(meta='siteinfo')
    5.20          try:
    5.21              g = result['general']
    5.22 -            return metabook.make_source(
    5.23 +            self.source = metabook.make_source(
    5.24                  url=g['base'],
    5.25                  name='%s (%s)' % (g['sitename'], g['lang']),
    5.26 +                language=g['lang'],
    5.27              )
    5.28 +            return self.source
    5.29          except KeyError:
    5.30              return None
    5.31      
     6.1 --- a/mwlib/options.py	Thu Jul 03 17:11:35 2008 +0200
     6.2 +++ b/mwlib/options.py	Thu Jul 03 17:11:44 2008 +0200
     6.3 @@ -79,5 +79,6 @@
     6.4                      self.options.collectionpage,
     6.5                  ))
     6.6              self.metabook = metabook.parse_collection_page(wikitext)
     6.7 +            env.metabook = self.metabook
     6.8          return env
     6.9      
     7.1 --- a/mwlib/parser.py	Thu Jul 03 17:11:35 2008 +0200
     7.2 +++ b/mwlib/parser.py	Thu Jul 03 17:11:44 2008 +0200
     7.3 @@ -9,6 +9,8 @@
     7.4  
     7.5  from mwlib.scanner import tokenize, TagToken, EndTagToken
     7.6  from mwlib.log import Log
     7.7 +from mwlib.namespace import namespace_maps, interwiki_map
     7.8 +from mwlib.lang import languages
     7.9  
    7.10  log = Log("parser")
    7.11  
    7.12 @@ -193,82 +195,165 @@
    7.13  
    7.14  class Link(Node):
    7.15      target = None
    7.16 -    specialPrefixes = set(["wikipedia", "wiktionary", "wikibooks", "wikisource",
    7.17 -                           "wikiquote", "meta", "talk",
    7.18 -                           "commons", "wikinews", "template", "wikitravel", "help", "vorlage"])
    7.19 -    from mwlib.lang import languages
    7.20 +    from mwlib.namespace import NS_MAIN, NS_CATEGORY, NS_IMAGE
    7.21 +
    7.22      colon = False
    7.23  
    7.24      def hasContent(self):
    7.25          if self.target:
    7.26              return True
    7.27          return False
    7.28 +
    7.29 +    @classmethod
    7.30 +    def _buildSpecializeMap(cls, namespaces, interwikis, langs):
    7.31 +        """
    7.32 +        Returns a dict mapping namespace prefixes to a tuple of form
    7.33 +        (link_class, namespace_value).
    7.34 +        """
    7.35 +        res = {}
    7.36 +        for name, num in namespaces.iteritems():
    7.37 +            name = name.lower()
    7.38 +            if num == cls.NS_CATEGORY:
    7.39 +                res[name] = (CategoryLink, num)
    7.40 +            elif num == cls.NS_IMAGE:
    7.41 +                res[name] = (ImageLink, num)
    7.42 +            else:
    7.43 +                res[name] = (NamespaceLink, num)
    7.44 +
    7.45 +        for name, target in interwikis.iteritems():
    7.46 +            res[name.lower()] = (InterwikiLink, target)
    7.47 +
    7.48 +        for lang in langs:
    7.49 +            res[lang.lower()] = (LangLink, lang)
    7.50 +
    7.51 +        return res
    7.52          
    7.53 +    @classmethod
    7.54 +    def _setSpecializeMap(cls, nsMap='default'):
    7.55 +        cls._specializeMap = cls._buildSpecializeMap(
    7.56 +            namespace_maps[nsMap], interwiki_map, languages)
    7.57 +
    7.58      def _specialize(self):
    7.59 +        """
    7.60 +        Handles different forms of link, e.g.:
    7.61 +            - [[Foo]]
    7.62 +            - [[Foo|Bar]]
    7.63 +            - [[Category:Foo]]
    7.64 +            - [[:Category:Foo]]
    7.65 +        """
    7.66 +
    7.67          if not self.children:
    7.68              return
    7.69  
    7.70          if type(self.children[0]) != Text:
    7.71              return
    7.72              
    7.73 -        self.target = target = self.children[0].caption.strip()
    7.74 +        # Handle [[Foo|Bar]]
    7.75 +        full_target = self.children[0].caption.strip()
    7.76          del self.children[0]
    7.77          if self.children and self.children[0] == Control("|"):
    7.78              del self.children[0]
    7.79 +
    7.80 +        # Mark [[:Category:Foo]]. See below
    7.81 +        if full_target.startswith(':'):
    7.82 +            self.colon = True
    7.83 +            full_target = full_target[1:]
    7.84 +        self.full_target = full_target
    7.85          
    7.86 -        pic = self.target
    7.87 -        if pic.startswith(':'):
    7.88 -            self.colon = True
    7.89 -            
    7.90 -        
    7.91 -        
    7.92 -        # pic == "Bild:Wappen_von_Budenheim.png"
    7.93 -        
    7.94 -        pic = pic.strip(': ')
    7.95 -        if ':' not in pic:
    7.96 -            return
    7.97 -            
    7.98 -        linktype, pic = pic.split(':', 1)
    7.99 -        linktype = linktype.lower().strip(" :")
   7.100 -        
   7.101 -        if linktype in ("category", "kategorie"):
   7.102 -            self.__class__ = CategoryLink
   7.103 -            self.target = pic.strip()
   7.104 +        try:
   7.105 +            ns, title = full_target.split(':', 1)
   7.106 +        except ValueError:
   7.107 +            self.namespace = self.NS_MAIN
   7.108 +            self.target = full_target
   7.109 +            self.__class__ = ArticleLink
   7.110              return
   7.111  
   7.112 -        if linktype in self.specialPrefixes:
   7.113 -            self.__class__ = SpecialLink
   7.114 -            self.target = pic.strip()
   7.115 -            self.ns = linktype            
   7.116 +        (self.__class__, self.namespace) = (
   7.117 +                self._specializeMap.get(ns.lower(), (ArticleLink, self.NS_MAIN)))
   7.118  
   7.119 +        if len(ns) == 2:
   7.120 +            # Assume this is an unlisted language
   7.121 +            self.__class__ = LangLink
   7.122 +            self.namespace = ns.lower()
   7.123 +
   7.124 +        if self.colon and self.namespace != self.NS_MAIN:
   7.125 +            # [[:Category:Foo]] should not be a category link
   7.126 +            self.__class__ = NamespaceLink
   7.127 +
   7.128 +        if self.namespace == self.NS_MAIN:
   7.129 +            # e.g. [[Blah: Foo]] is an ordinary article with a colon
   7.130 +            self.target = full_target
   7.131 +        else:
   7.132 +            self.target = title
   7.133 +
   7.134 +        if self.__class__ == ImageLink:
   7.135 +            # Handle images. First ensure they are syntactically sound.
   7.136 +
   7.137 +            try:
   7.138 +                prefix, suffix = title.rsplit('.', 1)
   7.139 +                if suffix.lower() in ['jpg', 'jpeg', 'gif', 'png', 'svg']:
   7.140 +                    self._readArgs() # calls Image._readArgs()
   7.141 +                    return
   7.142 +            except ValueError:
   7.143 +                pass
   7.144 +            # We can't handle this as an image, so default:
   7.145 +            self.__class__ = NamespaceLink 
   7.146 +    
   7.147 +
   7.148 +    capitalizeTarget = False # Wiki-dependent setting, e.g. Wikipedia => True
   7.149 +
   7.150 +    _SPACE_RE = re.compile('[_\s]+')
   7.151 +    def _normalizeTarget(self):
   7.152 +        """
   7.153 +        Normalizes the format of the target with regards to whitespace and
   7.154 +        capitalization (depending on capitalizeTarget setting).
   7.155 +        """
   7.156 +
   7.157 +        if not self.target:
   7.158              return
   7.159  
   7.160 -        if linktype in self.languages:
   7.161 -            self.__class__ = LangLink
   7.162 -            return
   7.163 -            
   7.164 -        
   7.165 -        if linktype not in ("bild", "image", "imagen"):
   7.166 -            # assume a LangLink
   7.167 -            log.info("Unknown linktype:", repr(linktype))
   7.168 -            if len(linktype)==2:
   7.169 -                self.__class__ = LangLink
   7.170 -            return
   7.171 -        
   7.172 -        
   7.173 -        # pic == "Wappen_von_Budenheim.png"
   7.174 -        
   7.175 -        try:
   7.176 -            prefix, suffix = pic.rsplit('.', 1)
   7.177 -        except ValueError:
   7.178 -            return
   7.179 +        # really we should have a urllib.unquote() first, but in practice this
   7.180 +        # format may be rare enough to ignore
   7.181  
   7.182 -        if suffix.lower() in ['jpg', 'jpeg', 'gif', 'png', 'svg']:
   7.183 -            self.__class__ = ImageLink
   7.184 -            self.target = pic.strip()
   7.185 +        # [[__init__]] -> [[init]]
   7.186 +        self.target = self._SPACE_RE.sub(' ', self.target).strip()
   7.187 +        if self.capitalizeTarget:
   7.188 +            self.target = self.target[:1].upper() + self.target[1:]
   7.189  
   7.190  
   7.191 +# Link forms:
   7.192  
   7.193 +class ArticleLink(Link):
   7.194 +    pass
   7.195 +
   7.196 +class SpecialLink(Link):
   7.197 +    pass
   7.198 +
   7.199 +class NamespaceLink(SpecialLink):
   7.200 +    pass
   7.201 +
   7.202 +class InterwikiLink(SpecialLink):
   7.203 +    pass
   7.204 +
   7.205 +# Non-links with same syntax:
   7.206 +
   7.207 +class LangLink(Link):
   7.208 +    pass
   7.209 +
   7.210 +class CategoryLink(Link):
   7.211 +    pass
   7.212 +
   7.213 +class ImageLink(Link):
   7.214 +    target = None
   7.215 +    width = None
   7.216 +    height = None
   7.217 +    align = ''
   7.218 +    thumb = False
   7.219 +    
   7.220 +    def isInline(self):
   7.221 +        return not bool(self.align or self.thumb)
   7.222 +
   7.223 +    def _readArgs(self):
   7.224          idx = 0
   7.225          last = []
   7.226          
   7.227 @@ -328,25 +413,8 @@
   7.228          
   7.229          if not self.children:
   7.230              self.children = last
   7.231 -            
   7.232 -class ImageLink(Link):
   7.233 -    target = None
   7.234 -    width = None
   7.235 -    height = None
   7.236 -    align = ''
   7.237 -    thumb = False
   7.238 -    
   7.239 -    def isInline(self):
   7.240 -        return not bool(self.align or self.thumb)
   7.241 -    
   7.242 -class LangLink(Link):
   7.243 -    pass
   7.244  
   7.245 -class CategoryLink(Link):
   7.246 -    pass
   7.247 -
   7.248 -class SpecialLink(Link):
   7.249 -    pass
   7.250 +Link._setSpecializeMap('default') # initialise the Link class
   7.251  
   7.252              
   7.253  class Text(Node):
   7.254 @@ -365,10 +433,10 @@
   7.255  class Control(Text):
   7.256      pass
   7.257  
   7.258 -def _parseAtomFromString(s):
   7.259 +def _parseAtomFromString(s, lang=None):
   7.260      from mwlib import scanner
   7.261      tokens = scanner.tokenize(s)
   7.262 -    p=Parser(tokens)
   7.263 +    p=Parser(tokens, lang=lang)
   7.264      try:
   7.265          return p.parseAtom()
   7.266      except Exception, err:
   7.267 @@ -377,10 +445,10 @@
   7.268  
   7.269                    
   7.270      
   7.271 -def parse_fields_in_imagemap(imap):
   7.272 +def parse_fields_in_imagemap(imap, lang=None):
   7.273      
   7.274      if imap.image:
   7.275 -        imap.imagelink = _parseAtomFromString(u'[['+imap.image+']]')
   7.276 +        imap.imagelink = _parseAtomFromString(u'[['+imap.image+']]', lang=lang)
   7.277          if not isinstance(imap.imagelink, ImageLink):
   7.278              imap.imagelink = None
   7.279  
   7.280 @@ -397,13 +465,22 @@
   7.281  _ALPHA_RE = re.compile(r'[^\W\d_]+', re.UNICODE) # Matches alpha strings
   7.282              
   7.283  class Parser(object):
   7.284 -    def __init__(self, tokens, name=''):
   7.285 +    def __init__(self, tokens, name='', lang=None):
   7.286          self.tokens = tokens
   7.287 +        self.lang = lang
   7.288          self.pos = 0
   7.289          self.name = name
   7.290          self.lastpos = 0
   7.291          self.count = 0
   7.292 -
   7.293 +        
   7.294 +        if lang:
   7.295 +            nsMap = '%s+en_mw' % lang
   7.296 +            if nsMap not in namespace_maps:
   7.297 +                nsMap = 'default'
   7.298 +        else:
   7.299 +            nsMap = 'default'
   7.300 +        Link._setSpecializeMap(nsMap)
   7.301 +        
   7.302          from mwlib import tagext
   7.303          self.tagextensions = tagext.default_registry
   7.304          
   7.305 @@ -548,7 +625,7 @@
   7.306  
   7.307          if not obj.children and obj.target:
   7.308              # [[a]] -> [[a|a]]
   7.309 -            obj.append(Text(obj.target))
   7.310 +            obj.append(Text(obj.full_target))
   7.311  
   7.312          if isinstance(obj, ImageLink):
   7.313              return obj
   7.314 @@ -559,6 +636,8 @@
   7.315                  # [[a|a]]b -> [[a|ab]]
   7.316                  obj.append(Text(m.group(0)), True)
   7.317                  self.tokens[self.pos] = ('TEXT', self.token[1][m.end():])
   7.318 +
   7.319 +        obj._normalizeTarget()
   7.320              
   7.321          return obj
   7.322      
   7.323 @@ -668,7 +747,7 @@
   7.324                  continue
   7.325  
   7.326              # either image link or text inside
   7.327 -            n=_parseAtomFromString(u'[['+x+']]')
   7.328 +            n=_parseAtomFromString(u'[['+x+']]', lang=self.lang)
   7.329  
   7.330              if isinstance(n, ImageLink):
   7.331                  children.append(n)
   7.332 @@ -684,7 +763,7 @@
   7.333          txt = "".join(x.caption for x in node.find(Text))
   7.334          from mwlib import imgmap
   7.335          node.imagemap = imgmap.ImageMapFromString(txt)
   7.336 -        parse_fields_in_imagemap(node.imagemap)
   7.337 +        parse_fields_in_imagemap(node.imagemap, lang=self.lang)
   7.338  
   7.339          #print node.imagemap
   7.340          return node
     8.1 --- a/mwlib/uparser.py	Thu Jul 03 17:11:35 2008 +0200
     8.2 +++ b/mwlib/uparser.py	Thu Jul 03 17:11:44 2008 +0200
     8.3 @@ -76,7 +76,7 @@
     8.4  
     8.5  postprocessors = [removeBoilerplate, simplify, fixlitags]
     8.6  
     8.7 -def parseString(title=None, raw=None, wikidb=None, revision=None):
     8.8 +def parseString(title=None, raw=None, wikidb=None, revision=None, lang=None):
     8.9      """parse article with title from raw mediawiki text"""
    8.10      assert title is not None 
    8.11  
    8.12 @@ -86,12 +86,16 @@
    8.13      if wikidb:
    8.14          te = expander.Expander(raw, pagename=title, wikidb=wikidb)
    8.15          input = te.expandTemplates()
    8.16 +        if lang is None and hasattr(wikidb, 'getSource'):
    8.17 +            src = wikidb.getSource()
    8.18 +            if src:
    8.19 +                lang = src.get('language')
    8.20      else:
    8.21          input = raw
    8.22 -
    8.23 +    
    8.24      tokens = scanner.tokenize(input, title)
    8.25  
    8.26 -    a = parser.Parser(tokens, title).parse()
    8.27 +    a = parser.Parser(tokens, title, lang=lang).parse()
    8.28      a.caption = title
    8.29      for x in postprocessors:
    8.30          x(a)
     9.1 --- a/mwlib/wiki.py	Thu Jul 03 17:11:35 2008 +0200
     9.2 +++ b/mwlib/wiki.py	Thu Jul 03 17:11:44 2008 +0200
     9.3 @@ -135,8 +135,8 @@
     9.4      def get_source(self):
     9.5          if 'source' in self.metabook:
     9.6              return self.metabook['source']
     9.7 -        if hasattr(self.wiki, 'getMetaData'):
     9.8 -            return self.wiki.getMetaData()
     9.9 +        if hasattr(self.wiki, 'getSource'):
    9.10 +            return self.wiki.getSource()
    9.11          return metabook.make_source(
    9.12              name=self.configparser.get('wiki', 'name'),
    9.13              url=self.configparser.get('wiki', 'url'),
    10.1 --- a/mwlib/zipwiki.py	Thu Jul 03 17:11:35 2008 +0200
    10.2 +++ b/mwlib/zipwiki.py	Thu Jul 03 17:11:44 2008 +0200
    10.3 @@ -35,6 +35,9 @@
    10.4          except KeyError:
    10.5              pass
    10.6          return None
    10.7 +    
    10.8 +    def getSource(self):
    10.9 +        return self.metabook.get('source')
   10.10      
   10.11      def getRawArticle(self, title, revision=None):
   10.12          article = self._getArticle(title, revision=revision)
    12.1 --- a/tests/test_parser.py	Thu Jul 03 17:11:35 2008 +0200
    12.2 +++ b/tests/test_parser.py	Thu Jul 03 17:11:44 2008 +0200
    12.3 @@ -610,3 +610,59 @@
    12.4      assert u'<nosuchtag>' in txt, 'opening tag missing in asText()'
    12.5      assert u'</nosuchtag>' in txt, 'closing tag missing in asText()'
    12.6      
    12.7 +# Test varieties of link
    12.8 +
    12.9 +def test_plain_link():
   12.10 +    r=parse("[[bla]]").find(parser.ArticleLink)[0]
   12.11 +    assert r.target=='bla'
   12.12 +    assert r.children[0].caption == 'bla'
   12.13 +
   12.14 +def test_piped_link():
   12.15 +    r=parse("[[bla|blubb]]").find(parser.ArticleLink)[0]
   12.16 +    assert r.target=='bla'
   12.17 +    assert r.children[0].caption == 'blubb'
   12.18 +
   12.19 +def test_category_link():
   12.20 +    r=parse("[[category:bla]]").find(parser.CategoryLink)[0]
   12.21 +    assert r.target=='bla'
   12.22 +    assert r.namespace == 14
   12.23 +
   12.24 +def test_category_colon_link():
   12.25 +    r=parse("[[:category:bla]]").find(parser.SpecialLink)[0]
   12.26 +    assert r.target=='bla'
   12.27 +    assert r.namespace == 14
   12.28 +    assert not isinstance(r, parser.CategoryLink)
   12.29 +
   12.30 +def test_image_colon_link():
   12.31 +    r=parse("[[:image:bla.jpg]]").find(parser.SpecialLink)[0]
   12.32 +    assert r.target=='bla.jpg'
   12.33 +    assert r.namespace == 6
   12.34 +    assert not isinstance(r, parser.ImageLink)
   12.35 +
   12.36 +def test_interwiki_link():
   12.37 +    r=parse("[[wict:bla]]").find(parser.SpecialLink)[0]
   12.38 +    assert r.target=='bla'
   12.39 +    assert r.namespace == 'wiktionary'
   12.40 +
   12.41 +def test_language_link():
   12.42 +    r=parse("[[es:bla]]").find(parser.LangLink)[0]
   12.43 +    assert r.target=='bla'
   12.44 +    assert r.namespace == 'es'
   12.45 +
   12.46 +def test_long_language_link():
   12.47 +    r=parse("[[csb:bla]]").find(parser.LangLink)[0]
   12.48 +    assert r.target=='bla'
   12.49 +    assert r.namespace == 'csb'
   12.50 +
   12.51 +def test_normalize():
   12.52 +    r=parse("[[MediaWiki:__bla_ _]]").find(parser.LangLink)[0]
   12.53 +    assert r.target=='bla'
   12.54 +    assert r.namespace == 8
   12.55 +
   12.56 +def test_normalize_with_caps():
   12.57 +    parser.Link.capitalizeTarget = True
   12.58 +    r=parse("[[MediaWiki:__bla_ _ ]]").find(parser.LangLink)[0]
   12.59 +    parser.Link.capitalizeTarget = False
   12.60 +    assert r.target=='Bla'
   12.61 +    assert r.namespace == 8
   12.62 +    assert r.children[0].caption == 'MediaWiki:__bla_ _'
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/mwlib/dumpparser.py	Thu Jul 03 17:11:44 2008 +0200
    13.3 @@ -0,0 +1,210 @@
    13.4 +import os
    13.5 +import re
    13.6 +
    13.7 +try:
    13.8 +    from xml.etree import cElementTree
    13.9 +except ImportError:
   13.10 +    import cElementTree
   13.11 +
   13.12 +ns = '{http://www.mediawiki.org/xml/export-0.3/}'
   13.13 +class Tags:
   13.14 +
   13.15 +    # <namespaces><namespace> inside <siteinfo>
   13.16 +    namespace = ns + 'namespaces/' + ns + 'namespace'
   13.17 +
   13.18 +    page = ns + 'page'
   13.19 +
   13.20 +    # <title> inside <page>
   13.21 +    title = ns + 'title'
   13.22 +
   13.23 +    # <revision> inside <page>
   13.24 +    revision = ns + 'revision'
   13.25 +
   13.26 +    # <id> inside <revision>
   13.27 +    revid = ns + 'id'
   13.28 +
   13.29 +    # <contributor><username> inside <revision>
   13.30 +    username = ns + 'contributor/' + ns + 'username'
   13.31 +
   13.32 +    # <text> inside <revision>
   13.33 +    text = ns + 'text'
   13.34 +
   13.35 +    # <timestamp> inside <revision>
   13.36 +    timestamp = ns + 'timestamp'
   13.37 +
   13.38 +    # <revision><text> inside <page>
   13.39 +    revision_text = ns + 'revision/' + ns + 'text'
   13.40 +
   13.41 +    siteinfo = ns + "siteinfo"
   13.42 +
   13.43 +NS_MEDIA          = -2
   13.44 +NS_SPECIAL        = -1
   13.45 +NS_MAIN           =  0
   13.46 +NS_TALK           =  1
   13.47 +NS_USER           =  2
   13.48 +NS_USER_TALK      =  3
   13.49 +NS_PROJECT        =  4
   13.50 +NS_PROJECT_TALK   =  5
   13.51 +NS_IMAGE          =  6
   13.52 +NS_IMAGE_TALK     =  7
   13.53 +NS_MEDIAWIKI      =  8
   13.54 +NS_MEDIAWIKI_TALK =  9
   13.55 +NS_TEMPLATE       = 10
   13.56 +NS_TEMPLATE_TALK  = 11
   13.57 +NS_HELP           = 12
   13.58 +NS_HELP_TALK      = 13
   13.59 +NS_CATEGORY       = 14
   13.60 +NS_CATEGORY_TALK  = 15
   13.61 +
   13.62 +class Page(object):
   13.63 +    __slots__ = [
   13.64 +        'title', 'pageid', 'namespace_text',
   13.65 +        'namespace',
   13.66 +        'revid', 'timestamp',
   13.67 +        'username', 'userid',
   13.68 +        'minor', 'comment', 'text'
   13.69 +    ]
   13.70 +
   13.71 +    def __init__(self):
   13.72 +        self.namespace_text = ''
   13.73 +        self.namespace = NS_MAIN
   13.74 +
   13.75 +    redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE)
   13.76 +
   13.77 +    @property
   13.78 +    def redirect(self):
   13.79 +        mo = self.redirect_rex.search(self.text)
   13.80 +        if mo:
   13.81 +            return mo.group('redirect').split("|", 1)[0]
   13.82 +        return None
   13.83 +
   13.84 +    def __repr__(self):
   13.85 +        text = repr(self.text[:50])
   13.86 +        redir = self.redirect
   13.87 +        if redir:
   13.88 +            text = "Redirect to %s" % repr(redir)
   13.89 +        return 'Page(%s (@%s): %s)' % (repr(self.title), self.timestamp, text)
   13.90 +
   13.91 +
   13.92 +class DumpParser(object):
   13.93 +    namespaces = {
   13.94 +        'template': NS_TEMPLATE,
   13.95 +        'vorlage': NS_TEMPLATE,
   13.96 +        'category': NS_CATEGORY,
   13.97 +        'kategorie': NS_CATEGORY,
   13.98 +        'image': NS_IMAGE,
   13.99 +        'bild': NS_IMAGE,
  13.100 +        'wikipedia': NS_PROJECT,
  13.101 +    }
  13.102 +
  13.103 +    default_namespaces = [NS_MAIN, NS_TEMPLATE]
  13.104 +
  13.105 +    tags = Tags()
  13.106 +
  13.107 +    def __init__(self, xmlfilename,
  13.108 +                 namespace_filter=default_namespaces,
  13.109 +                 ignore_redirects=False):
  13.110 +        self.xmlfilename = xmlfilename
  13.111 +        self.namespace_filter = namespace_filter
  13.112 +        self.ignore_redirects = ignore_redirects
  13.113 +
  13.114 +    def openInputStream(self):
  13.115 +        if self.xmlfilename.lower().endswith(".bz2"):
  13.116 +            f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r")
  13.117 +        elif self.xmlfilename.lower().endswith(".7z"):
  13.118 +            f = os.popen("7z -so x %s" % self.xmlfilename, "r")
  13.119 +        else:
  13.120 +            f = open(self.xmlfilename, "r")        
  13.121 +
  13.122 +        return f
  13.123 +
  13.124 +    @staticmethod
  13.125 +    def getTag(elem):
  13.126 +        # rough is good enough
  13.127 +        return elem.tag[elem.tag.rindex('}')+1:]
  13.128 +
  13.129 +    def handleSiteinfo(self, siteinfo):
  13.130 +        for nsElem in siteinfo.findall(self.tags.namespace):
  13.131 +            try:
  13.132 +                self.namespaces[nsElem.text.lower()] = int(nsElem.get('key'))
  13.133 +            except AttributeError:
  13.134 +                # text is probably None
  13.135 +                pass
  13.136 +        
  13.137 +    def __iter__(self):
  13.138 +        f = self.openInputStream()    
  13.139 +        
  13.140 +        elemIter = (el for evt, el in cElementTree.iterparse(f))
  13.141 +        for elem in elemIter:
  13.142 +            if self.getTag(elem) == 'page':
  13.143 +                page = self.handlePageElement(elem)
  13.144 +                if page:
  13.145 +                    yield page
  13.146 +                elem.clear()
  13.147 +            elif self.getTag(elem) == 'siteinfo':
  13.148 +                self.handleSiteinfo(elem)
  13.149 +                elem.clear()
  13.150 +        
  13.151 +        f.close()
  13.152 +    
  13.153 +    def handlePageElement(self, pageElem):
  13.154 +        res = Page()
  13.155 +        lastRevision = None
  13.156 +        for el in pageElem:
  13.157 +            tag = self.getTag(el)
  13.158 +            if tag == 'title':