diff --git a/requirements.txt b/requirements.txt index b908fc7..5c4c2c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -requests==2.23.0 # Make HTTP requests -beautifulsoup4==4.8.2 # Parse HTML responses -colorlog==4.1.0 # Extensive console logging -tqdm==4.43.0 # Progressbar for downloads \ No newline at end of file +requests==2.31.0 # Make HTTP requests +beautifulsoup4==4.12.3 # Parse HTML responses +colorlog==6.8.2 # Extensive console logging +tqdm==4.66.1 # Progressbar for downloads +setuptools==69.0.3 diff --git a/wikiraider/actions/ActionParse.py b/wikiraider/actions/ActionParse.py index 2b2d36d..2f370ad 100644 --- a/wikiraider/actions/ActionParse.py +++ b/wikiraider/actions/ActionParse.py @@ -90,11 +90,11 @@ def run(self): colorlog.getLogger().info('In the meantime the consumers are already processing the pages...') for xml_file in xml_files: - for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r')): + for event, element in xml.etree.cElementTree.iterparse(open(xml_file, 'r', encoding='utf-8', errors='ignore')): if element.tag.endswith('page'): - title = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}title') - revision = element.find('.//{http://www.mediawiki.org/xml/export-0.10/}revision') - text = revision.find('.//{http://www.mediawiki.org/xml/export-0.10/}text') + title = element.find('.//{http://www.mediawiki.org/xml/export-0.11/}title') + revision = element.find('.//{http://www.mediawiki.org/xml/export-0.11/}revision') + text = revision.find('.//{http://www.mediawiki.org/xml/export-0.11/}text') element.clear() @@ -121,7 +121,7 @@ def on_finish(self): colorlog.getLogger().info('Found a total of {} word(s).'.format(len(self.queue.results))) if len(self.queue.results) >= 10: - colorlog.getLogger().info('Here are 10 of them {}.'.format(random.sample(self.queue.results, 10))) + colorlog.getLogger().info('Here are 10 of them {}.'.format(random.sample(list(self.queue.results), 10))) colorlog.getLogger().info('Writing all words to a file...') WriterHelper.write_to_txt(self.get_wiki_name(), self.queue.results)