From 6fb30a31a1256fea4027e20a888ea3f67c41281b Mon Sep 17 00:00:00 2001 From: "Andre D." <152297+glynx@users.noreply.github.com> Date: Thu, 21 Nov 2024 20:18:02 +0100 Subject: [PATCH] Dynamic Default Namespace in ActionParse New dumps seem to use a newer revision of the export namespace. Hard-coding the namespace together with its version (export-0.10) is therefore unwise; derive it from the parsed element tags instead. --- wikiraider/actions/ActionParse.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/wikiraider/actions/ActionParse.py b/wikiraider/actions/ActionParse.py index 2b2d36d..7682387 100644 --- a/wikiraider/actions/ActionParse.py +++ b/wikiraider/actions/ActionParse.py @@ -89,12 +89,16 @@ def run(self): colorlog.getLogger().info('Iterating all of the XML files and pushing pages to queue. This might take a while...') colorlog.getLogger().info('In the meantime the consumers are already processing the pages...') + namespaces = {'': None} for xml_file in xml_files: for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r')): + if element.tag.startswith('{'): + # extract last used namespace dynamically and set it as default + namespaces[''] = element.tag.split('}')[0][1:] if element.tag.endswith('page'): - title = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}title') - revision = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}revision') - text = revision.find('.//{http://www.mediawiki.org/xml/export-0.10/}text') + title = element.find('.//title', namespaces) + revision = element.find('.//revision', namespaces) + text = revision.find('.//text', namespaces) element.clear()