Wikipedia provides a massive dump containing all edits on all articles. It's about 150 GB and takes about a week to download. The file is http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-meta-history.xml.bz2.
Clearly, to parse such a large file, you can't use a DOM API. You must use something like SAX. There is a Python library to parse this file and shove it into a database, but I actually don't want it in a database. Here's some code to parse the data, or at least the parts I care about:
Updated! Fixed the fact that the characters method must apply its own buffering. Fixed an encoding issue.
Clearly, to parse such a large file, you can't use a DOM API. You must use something like SAX. There is a Python library to parse this file and shove it into a database, but I actually don't want it in a database. Here's some code to parse the data, or at least the parts I care about:
Updated! Fixed the fact that the characters method must apply its own buffering. Fixed an encoding issue.
I created enwiki-latest-pages-meta-history.test.xml as a short snippet of the XML just so I could do some testing:

#!/usr/bin/env python
"""Parse the enwiki-latest-pages-meta-history.xml file."""
from __future__ import with_statement
from contextlib import closing
from StringIO import StringIO
from optparse import OptionParser
import sys
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from blueplate.parsing.tsv import create_default_writer
__docformat__ = "restructuredtext"
class WPXMLHandler(ContentHandler):

    """Parse the enwiki-latest-pages-meta-history.xml file.

    This parser looks for just the things we're interested in.  It maintains
    a tag stack because the XML format actually does have some depth and
    context does actually matter.
    """

    def __init__(self, page_handler):
        """Do some setup.

        page_handler
            This is a callback.  It will be called with a page in the form
            of a dict such as::

                {'id': u'8',
                 'revisions': [{'timestamp': u'2001-01-20T15:01:12Z',
                                'user': u'ip:pD950754B.dip.t-dialin.net'},
                               {'timestamp': u'2006-09-08T04:16:46Z',
                                'user': u'username:Rory096'}],
                 'title': u'AppliedEthics'}

        """
        ContentHandler.__init__(self)  # Be a well-behaved SAX handler.
        self._tag_stack = []  # Names of the currently open elements.
        self._page_handler = page_handler
        # Defensive default: a SAX parser may call characters() (e.g. for
        # whitespace at the document level) before the first startElement()
        # ever runs _setup_characters(), which would otherwise raise an
        # AttributeError.
        self._characters_buf = None

    def _try_calling(self, method_name, *args):
        """Try calling the method with the given method_name.

        If it doesn't exist, just return.

        Note, I don't want to accept **kargs because:

        a) I don't need them yet.
        b) They're really expensive, and this function is going to get
           called a lot.

        Let's not think of it as premature optimization, let's think of it
        as avoiding premature flexibility ;)

        """
        try:
            f = getattr(self, method_name)
        except AttributeError:
            pass
        else:
            return f(*args)

    def startElement(self, name, attr):
        """Dispatch to methods like _start_tagname."""
        self._tag_stack.append(name)
        self._try_calling('_start_' + name, attr)
        self._setup_characters()

    def _start_page(self, attr):
        # Start a fresh page; the revisions list fills up as <revision>
        # elements are encountered.
        self._page = dict(revisions=[])

    def _start_revision(self, attr):
        # Each revision gets its own dict; the _characters_* handlers below
        # always write into the last (i.e. current) one.
        self._page['revisions'].append({})

    def endElement(self, name):
        """Dispatch to methods like _end_tagname."""
        self._teardown_characters()
        self._try_calling('_end_' + name)
        self._tag_stack.pop()

    def _end_page(self):
        # The page is complete; hand it off to the caller's callback.
        self._page_handler(self._page)

    def _setup_characters(self):
        """Setup the callbacks to receive character data.

        The Parser will call the "characters" method to report each chunk
        of character data.  SAX parsers may return all contiguous character
        data in a single chunk, or they may split it into several chunks.
        Hence, this class has to take care of some buffering.

        """
        method_name = '_characters_' + '_'.join(self._tag_stack)
        if hasattr(self, method_name):
            self._characters_buf = StringIO()
        else:
            # Nothing cares about this element's text, so don't buffer it.
            self._characters_buf = None

    def characters(self, s):
        """Buffer the given characters."""
        if self._characters_buf is not None:
            self._characters_buf.write(s)

    def _teardown_characters(self):
        """Now that we have the entire string, put it where it needs to go.

        Dispatch to methods like _characters_some_stack_of_tags.  Drop
        strings that are just whitespace.

        """
        if self._characters_buf is None:
            return
        s = self._characters_buf.getvalue()
        if s.strip() == '':
            return
        method_name = '_characters_' + '_'.join(self._tag_stack)
        self._try_calling(method_name, s)

    def _characters_mediawiki_page_title(self, s):
        self._page['title'] = s

    def _characters_mediawiki_page_id(self, s):
        self._page['id'] = s

    def _characters_mediawiki_page_revision_timestamp(self, s):
        self._page['revisions'][-1]['timestamp'] = s

    def _characters_mediawiki_page_revision_contributor_username(self, s):
        # Prefix the value so usernames and IPs can never collide.
        self._page['revisions'][-1]['user'] = 'username:' + s

    def _characters_mediawiki_page_revision_contributor_ip(self, s):
        self._page['revisions'][-1]['user'] = 'ip:' + s
def parsewpxml(file, page_handler):
    """Call WPXMLHandler.

    file
        This is the name of the file to parse.
    page_handler
        See WPXMLHandler.__init__.

    """
    handler = WPXMLHandler(page_handler)
    sax_parser = make_parser()
    sax_parser.setContentHandler(handler)
    sax_parser.parse(file)
def main(argv=None, # Defaults to sys.argv.
input=sys.stdin, _open=open):
"""Run the application.
The arguments are really there for dependency injection.
"""
def page_handler(page):
"""Write the right bits to the right files."""
try:
atoms_writer.writerow((page['id'], page['title']))
for rev in page['revisions']:
if not 'user' in rev:
continue
triplets_writer.writerow(
(rev['user'], rev['timestamp'], page['id']))
except Exception, e:
print >> sys.stderr, "%s: %s\n%s" % (parser.get_prog_name(),
e, page)
global parser
parser = OptionParser()
parser.add_option('--atoms', dest='atoms',
help="store atom ids and names in this file",
metavar='FILE.tsv')
parser.add_option('--user-timestamp-atom-triplets',
dest='user_timestamp_atom_triplets',
help="store (user, timestamp, atom) triplets in this file",
metavar='FILE.tsv')
(options, args) = parser.parse_args(args=argv)
if args:
parser.error("No arguments expected")
for required in ('atoms', 'user_timestamp_atom_triplets'):
if not getattr(options, required):
parser.error('The %s parameter is required' % required)
LINE_BUFFERED = 1
with closing(_open(options.atoms, 'w', LINE_BUFFERED)) as atoms_file:
with closing(_open(options.user_timestamp_atom_triplets,
'w', LINE_BUFFERED)) as triplets_file:
atoms_writer = create_default_writer(atoms_file)
triplets_writer = create_default_writer(triplets_file)
parsewpxml(input, page_handler)
# Script entry point; all of the real work happens in main().
if __name__ == '__main__':
    main()
Here's my test code:

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<base>http://en.wikipedia.org/wiki/Main_Page</base>
<generator>MediaWiki 1.13alpha</generator>
<case>first-letter</case>
<namespaces>
<namespace key="-2">Media</namespace>
<namespace key="-1">Special</namespace>
<namespace key="0" />
<namespace key="1">Talk</namespace>
<namespace key="2">User</namespace>
<namespace key="3">User talk</namespace>
<namespace key="4">Wikipedia</namespace>
<namespace key="5">Wikipedia talk</namespace>
<namespace key="6">Image</namespace>
<namespace key="7">Image talk</namespace>
<namespace key="8">MediaWiki</namespace>
<namespace key="9">MediaWiki talk</namespace>
<namespace key="10">Template</namespace>
<namespace key="11">Template talk</namespace>
<namespace key="12">Help</namespace>
<namespace key="13">Help talk</namespace>
<namespace key="14">Category</namespace>
<namespace key="15">Category talk</namespace>
<namespace key="100">Portal</namespace>
<namespace key="101">Portal talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>AppliedEthics</title>
<id>8</id>
<revision>
<id>233189</id>
<timestamp>2001-01-20T15:01:12Z</timestamp>
<contributor>
<ip>pD950754B.dip.t-dialin.net</ip>
</contributor>
<minor />
<comment>*</comment>
<text xml:space="preserve">Something the Marketing Dept. will never fully understand.
</text>
</revision>
<revision>
<id>15898943</id>
<timestamp>2002-02-25T15:43:11Z</timestamp>
<contributor>
<ip>Conversion script</ip>
</contributor>
<minor />
<comment>Automated conversion</comment>
<text xml:space="preserve">#REDIRECT [[Applied ethics]]
</text>
</revision>
<revision>
<id>74466767</id>
<timestamp>2006-09-08T04:16:46Z</timestamp>
<contributor>
<username>Rory096</username>
<id>750223</id>
</contributor>
<comment>cat rd</comment>
<text xml:space="preserve">#REDIRECT [[Applied ethics]] {{R from CamelCase}}</text>
</revision>
<revision>
<id>133180238</id>
<timestamp>2007-05-24T14:41:48Z</timestamp>
<contributor>
<username>FunnyCharé</username>
<id>4477979</id>
</contributor>
<minor />
<comment>Robot: Automated text replacement (-\[\[(.*?[\:|\|])*?(.+?)\]\] +\g<2>)</comment>
<text xml:space="preserve">#REDIRECT Applied ethics {{R from CamelCase}}</text>
</revision>
<revision>
<id>133452279</id>
<timestamp>2007-05-25T17:12:09Z</timestamp>
<contributor>
<username>Gurch</username>
<id>241822</id>
</contributor>
<minor />
<comment>Revert edit(s) by [[Special:Contributions/FunnyCharé|FunnyCharé]] to last version by [[Special:Contributions/Rory096|Rory096]]</comment>
<text xml:space="preserve">#REDIRECT [[Applied ethics]] {{R from CamelCase}}</text>
</revision>
</page>
<page>
<title>AccessibleComputing</title>
<id>10</id>
<revision>
<id>233192</id>
<timestamp>2001-01-21T02:12:21Z</timestamp>
<contributor>
<username>RoseParks</username>
<id>99</id>
</contributor>
<comment>*</comment>
<text xml:space="preserve">This subject covers
* AssistiveTechnology
* AccessibleSoftware
* AccessibleWeb
* LegalIssuesInAccessibleComputing
</text>
</revision>
<revision>
<id>862220</id>
<timestamp>2002-02-25T15:43:11Z</timestamp>
<contributor>
<ip>Conversion script</ip>
</contributor>
<minor />
<comment>Automated conversion</comment>
<text xml:space="preserve">#REDIRECT [[Accessible Computing]]
</text>
</revision>
<revision>
<id>15898945</id>
<timestamp>2003-04-25T22:18:38Z</timestamp>
<contributor>
<username>Ams80</username>
<id>7543</id>
</contributor>
<minor />
<comment>Fixing redirect</comment>
<text xml:space="preserve">#REDIRECT [[Accessible_computing]]</text>
</revision>
<revision>
<id>56681914</id>
<timestamp>2006-06-03T16:55:41Z</timestamp>
<contributor>
<username>Nzd</username>
<id>516514</id>
</contributor>
<minor />
<comment>fix double redirect</comment>
<text xml:space="preserve">#REDIRECT [[Computer accessibility]]</text>
</revision>
<revision>
<id>74466685</id>
<timestamp>2006-09-08T04:16:04Z</timestamp>
<contributor>
<username>Rory096</username>
<id>750223</id>
</contributor>
<comment>cat rd</comment>
<text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
</revision>
<revision>
<id>133180268</id>
<timestamp>2007-05-24T14:41:58Z</timestamp>
<contributor>
<username>FunnyCharé</username>
<id>4477979</id>
</contributor>
<minor />
<comment>Robot: Automated text replacement (-\[\[(.*?[\:|\|])*?(.+?)\]\] +\g<2>)</comment>
<text xml:space="preserve">#REDIRECT Computer accessibility {{R from CamelCase}}</text>
</revision>
<revision>
<id>133452289</id>
<timestamp>2007-05-25T17:12:12Z</timestamp>
<contributor>
<username>Gurch</username>
<id>241822</id>
</contributor>
<minor />
<comment>Revert edit(s) by [[Special:Contributions/FunnyCharé|FunnyCharé]] to last version by [[Special:Contributions/Rory096|Rory096]]</comment>
<text xml:space="preserve">#REDIRECT [[Computer accessibility]] {{R from CamelCase}}</text>
</revision>
<!-- For some reason, I encountered an edit with no IP or username. -->
<revision>
<id>9339391</id>
<timestamp>2005-01-13T17:22:17Z</timestamp>
<contributor>
<ip />
</contributor>
<text xml:space="preserve">blah, blah, blah</text>
</revision>
</page>
</mediawiki>
"""Test the parsewpxml module."""
from StringIO import StringIO # cStringIO won't work here.
import os
from nose.tools import assert_true, assert_equal
from projects.wp import parsewpxml
XML_FILE = os.path.join(os.path.dirname(__file__),
'enwiki-latest-pages-meta-history.test.xml')
__docformat__ = "restructuredtext"
def test_xml_file_exists():
    """Sanity check: the sample XML fixture must be present on disk."""
    assert_true(os.path.exists(XML_FILE))
def test_parsewpxml():
    """Parse the sample XML file and check the resulting page dicts."""
    actual = []
    # The list's own append method serves as the page callback.
    parsewpxml.parsewpxml(XML_FILE, actual.append)
    expected = [
        {'id': u'8',
         'revisions': [{'timestamp': u'2001-01-20T15:01:12Z',
                        'user': u'ip:pD950754B.dip.t-dialin.net'},
                       {'timestamp': u'2002-02-25T15:43:11Z',
                        'user': u'ip:Conversion script'},
                       {'timestamp': u'2006-09-08T04:16:46Z',
                        'user': u'username:Rory096'},
                       {'timestamp': u'2007-05-24T14:41:48Z',
                        'user': u'username:FunnyChar\xe9'},
                       {'timestamp': u'2007-05-25T17:12:09Z',
                        'user': u'username:Gurch'}],
         'title': u'AppliedEthics'},
        {'id': u'10',
         'revisions': [{'timestamp': u'2001-01-21T02:12:21Z',
                        'user': u'username:RoseParks'},
                       {'timestamp': u'2002-02-25T15:43:11Z',
                        'user': u'ip:Conversion script'},
                       {'timestamp': u'2003-04-25T22:18:38Z',
                        'user': u'username:Ams80'},
                       {'timestamp': u'2006-06-03T16:55:41Z',
                        'user': u'username:Nzd'},
                       {'timestamp': u'2006-09-08T04:16:04Z',
                        'user': u'username:Rory096'},
                       {'timestamp': u'2007-05-24T14:41:58Z',
                        'user': u'username:FunnyChar\xe9'},
                       {'timestamp': u'2007-05-25T17:12:12Z',
                        'user': u'username:Gurch'},
                       # The revision with neither <username> nor <ip>.
                       {'timestamp': u'2005-01-13T17:22:17Z'}],
         'title': u'AccessibleComputing'}]
    assert_equal(actual, expected)
def test_main():
    """Testing the main method involves a fair bit of dependency injection."""

    class UnclosableStringIO(StringIO):

        """This is a StringIO that ignores the close method.

        main() closes its output files, but we still need to read the
        buffers afterwards, so just swallow the close.
        """

        def close(self):
            pass

    def _open(name, *args):
        """Return StringIO() buffers instead of real open file handles."""
        if name == 'atoms.tsv':
            return atoms_file
        elif name == 'triplets.tsv':
            return triplets_file
        else:
            # Include the name so an unexpected path is easy to diagnose.
            raise ValueError(name)

    atoms_file = UnclosableStringIO()
    triplets_file = UnclosableStringIO()
    xml_file = open(XML_FILE)
    try:
        parsewpxml.main(
            # Use the full option name; the old test relied on optparse's
            # prefix matching by passing "--user-timestamp-atom-triplet",
            # which breaks as soon as a similarly named option is added.
            argv=['--atoms=atoms.tsv',
                  '--user-timestamp-atom-triplets=triplets.tsv'],
            input=xml_file,
            _open=_open)
    finally:
        # Don't leak the input file handle.
        xml_file.close()
    expected_atoms = """\
8\tAppliedEthics
10\tAccessibleComputing
"""
    expected_triplets = """\
ip:pD950754B.dip.t-dialin.net\t2001-01-20T15:01:12Z\t8
ip:Conversion script\t2002-02-25T15:43:11Z\t8
username:Rory096\t2006-09-08T04:16:46Z\t8
username:FunnyChar\xc3\xa9\t2007-05-24T14:41:48Z\t8
username:Gurch\t2007-05-25T17:12:09Z\t8
username:RoseParks\t2001-01-21T02:12:21Z\t10
ip:Conversion script\t2002-02-25T15:43:11Z\t10
username:Ams80\t2003-04-25T22:18:38Z\t10
username:Nzd\t2006-06-03T16:55:41Z\t10
username:Rory096\t2006-09-08T04:16:04Z\t10
username:FunnyChar\xc3\xa9\t2007-05-24T14:41:58Z\t10
username:Gurch\t2007-05-25T17:12:12Z\t10
"""
    assert_equal(expected_atoms, atoms_file.getvalue())
    assert_equal(expected_triplets, triplets_file.getvalue())
Comments
Also, my code has a bug in it. The characters() method doesn't receive the whole string between tags; I thought it did. It gets a few characters at a time. Some code needs to be added to buffer the characters. I plan on doing that within the next couple weeks or so. Otherwise you lose the first half of some strings. Sorry about that.
from blueplate.parsing.tsv import create_default_writer
I need a tool to retrieve all the revisions of all the articles, or of the articles belonging to a particular category.
Thanks,
I'm not sure if they can parse the wikidumps that include all revisions.