From 54b4425e81f8b2f989bd7383d7a5de0a9263a9f8 Mon Sep 17 00:00:00 2001 From: Kingfisher <53167312+SmallKingfisher@users.noreply.github.com> Date: Fri, 2 Feb 2024 09:36:28 +0100 Subject: [PATCH 1/3] Bug fixes in ActionParse.py Fixed two bugs in the original code. Opening the XML files without specifying the encoding raises an error, so the files are now opened explicitly as UTF-8 (ignoring undecodable bytes). The other bug is a TypeError from random.sample() (population must be a sequence), fixed by converting the results to a list first. --- wikiraider/actions/ActionParse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wikiraider/actions/ActionParse.py b/wikiraider/actions/ActionParse.py index 2b2d36d..80c0264 100644 --- a/wikiraider/actions/ActionParse.py +++ b/wikiraider/actions/ActionParse.py @@ -90,7 +90,7 @@ def run(self): colorlog.getLogger().info('In the meantime the consumers are already processing the pages...') for xml_file in xml_files: - for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r')): + for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r', encoding='utf-8', errors='ignore')): if element.tag.endswith('page'): title = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}title') revision = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}revision') @@ -121,7 +121,7 @@ def on_finish(self): colorlog.getLogger().info('Found a total of {} word(s).'.format(len(self.queue.results))) if len(self.queue.results) >= 10: - colorlog.getLogger().info('Here are 10 of them {}.'.format(random.sample(self.queue.results, 10))) + colorlog.getLogger().info('Here are 10 of them {}.'.format(random.sample(list(self.queue.results), 10))) colorlog.getLogger().info('Writing all words to a file...') WriterHelper.write_to_txt(self.get_wiki_name(), self.queue.results) From 8428da364c9d464eeb86614697b32b5fb5df266e Mon Sep 17 00:00:00 2001 From: Kingfisher <53167312+SmallKingfisher@users.noreply.github.com> Date: Fri, 2 Feb 2024 09:47:57 +0100 Subject: [PATCH 2/3] Update requirements.txt Update the pinned packages to newer versions and add setuptools.
--- requirements.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index b908fc7..5c4c2c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -requests==2.23.0 # Make HTTP requests -beautifulsoup4==4.8.2 # Parse HTML responses -colorlog==4.1.0 # Extensive console logging -tqdm==4.43.0 # Progressbar for downloads \ No newline at end of file +requests==2.31.0 # Make HTTP requests +beautifulsoup4==4.12.3 # Parse HTML responses +colorlog==6.8.2 # Extensive console logging +tqdm==4.66.1 # Progressbar for downloads +setuptools==69.0.3 From 052bc87ab680d3ea78251731d9ae40aebb8d21f8 Mon Sep 17 00:00:00 2001 From: Guido Kroon Date: Mon, 9 Sep 2024 12:16:56 +0200 Subject: [PATCH 3/3] fix parser --- wikiraider/actions/ActionParse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wikiraider/actions/ActionParse.py b/wikiraider/actions/ActionParse.py index 80c0264..2f370ad 100644 --- a/wikiraider/actions/ActionParse.py +++ b/wikiraider/actions/ActionParse.py @@ -92,9 +92,9 @@ def run(self): for xml_file in xml_files: for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r', encoding='utf-8', errors='ignore')): if element.tag.endswith('page'): - title = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}title') - revision = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}revision') - text = revision.find('.//{http://www.mediawiki.org/xml/export-0.10/}text') + title = element.find('.//{http://www.mediawiki.org/xml/export-0.11/}title') + revision = element.find('.//{http://www.mediawiki.org/xml/export-0.11/}revision') + text = revision.find('.//{http://www.mediawiki.org/xml/export-0.11/}text') element.clear()