From 1392acb7cd47c240b7141b2a1e1ae3734f9e3d9d Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Thu, 8 Sep 2016 10:46:56 +0200 Subject: [PATCH 01/10] simple initial domain support --- mediawiki.py | 4 ++-- yamdwe.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mediawiki.py b/mediawiki.py index 136a5f0..ad980b0 100644 --- a/mediawiki.py +++ b/mediawiki.py @@ -11,9 +11,9 @@ from pprint import pprint class Importer(object): - def __init__(self, api_url, http_user=None, http_pass="", wiki_user=None, wiki_pass="", verbose=False): + def __init__(self, api_url, http_user=None, http_pass="", wiki_user=None, wiki_pass="", wiki_domain="", verbose=False): self.verbose = verbose - self.mw = simplemediawiki.MediaWiki(api_url,http_user=http_user,http_password=http_pass) + self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass, domain=wiki_domain) # login if necessary if wiki_user is not None: print("Logging in as %s..." % wiki_user) diff --git a/yamdwe.py b/yamdwe.py index b644ab5..f8f8d9f 100755 --- a/yamdwe.py +++ b/yamdwe.py @@ -38,7 +38,7 @@ def main(): if not args.mediawiki.endswith("api.php"): print("WARNING: Mediawiki URL does not end in 'api.php'... This has to be the URL of the Mediawiki API, not just the wiki. If you can't export anything, try adding '/api.php' to the wiki URL.") - importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.verbose) + importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.wiki_domain, args.verbose) exporter = dokuwiki.Exporter(args.dokuwiki) # Set the wikicontent's definition of File: and Image: prefixes (varies by language settings) @@ -49,7 +49,7 @@ def main(): pages = importer.get_all_pages() print("Found %d pages to export..." 
% len(pages)) - # Add a shameless "exported by yamdwe" note to the front page of the wiki + # Add a shameless "exported by yamdwe" note to the front page of the wiki - really shameless, but I'll keep it mainpage = importer.get_main_pagetitle() for page in pages: if page["title"] == mainpage: @@ -84,6 +84,7 @@ def main(): arguments.add_argument('--http_pass', help="Password for HTTP basic auth (if --http_user is specified but not --http_pass, yamdwe will prompt for a password)") arguments.add_argument('--wiki_user', help="Mediawiki login username") arguments.add_argument('--wiki_pass', help="Mediawiki login password (if --wiki_user is specified but not --wiki_pass, yamdwe will prompt for a password)") +arguments.add_argument('--wiki_domain', help="Mediawiki login domain") arguments.add_argument('-v', '--verbose',help="Print verbose progress and error messages", action="store_true") arguments.add_argument('mediawiki', metavar='MEDIAWIKI_API_URL', help="URL of mediawiki's api.php file (something like http://mysite/wiki/api.php)") arguments.add_argument('dokuwiki', metavar='DOKUWIKI_ROOT', help="Root path to an existing dokuwiki installation to add the Mediawiki pages to (can be a brand new install.)") From e517e12ca53d97e10922fa6f2469ad236f56b8be Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Thu, 8 Sep 2016 15:21:44 +0200 Subject: [PATCH 02/10] pull all pages and not only the first 10 --- mediawiki.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/mediawiki.py b/mediawiki.py index ad980b0..abc79ef 100644 --- a/mediawiki.py +++ b/mediawiki.py @@ -34,14 +34,30 @@ def verbose_print(self, msg): if self.verbose: print(msg) - def get_all_pages(self): + def get_all_pages(self, limit=500, more=True): + # the mediawiki api right now limits the pulled pages to 10 + # the maximum allowed is 500 + # after the first pages are pulled there is no check if there are additional pages to load + # if more is set one additional pull is 
done that resumes at the last pulled entry + # and it stops if the last page is the last page ;) """ Slurp all pages down from the mediawiki instance, together with all revisions including content. WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!! """ - query = {'list' : 'allpages'} - print("Getting list of pages...") + count=1 + newest=0 + query = {'list' : 'allpages', 'aplimit':limit} + print("Getting list of pages 0-%i..." % limit) pages = self._query(query, [ 'allpages' ]) + + while newest != pages[-1]['pageid'] and more: + print("Getting list of pages %i-%i..." % (count * limit, (count + 1) * limit)) + newest = pages[-1]['pageid'] + query = {'list': 'allpages', 'aplimit': 500, 'apfrom': pages[-1]['title']} + pages += self._query(query, ['allpages']) +# print("Newest is %s and the ''newest'' is %s" % (newest, pages[-1]['pageid'])) + count += 1 + self.verbose_print("Got %d pages." % len(pages)) print("Query page revisions (this may take a while)...") for page in pages: From 6de6bf4e1e612a1b5501cc7a74252f63b41d5496 Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Thu, 8 Sep 2016 15:26:31 +0200 Subject: [PATCH 03/10] more information about domain functionality --- yamdwe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yamdwe.py b/yamdwe.py index f8f8d9f..6ebc5cc 100755 --- a/yamdwe.py +++ b/yamdwe.py @@ -51,6 +51,7 @@ def main(): # Add a shameless "exported by yamdwe" note to the front page of the wiki - really shameless, but I'll keep it mainpage = importer.get_main_pagetitle() + for page in pages: if page["title"] == mainpage: latest = dict(page["revisions"][0]) @@ -84,7 +85,7 @@ def main(): arguments.add_argument('--http_pass', help="Password for HTTP basic auth (if --http_user is specified but not --http_pass, yamdwe will prompt for a password)") arguments.add_argument('--wiki_user', help="Mediawiki login username") arguments.add_argument('--wiki_pass', help="Mediawiki login password (if 
--wiki_user is specified but not --wiki_pass, yamdwe will prompt for a password)") -arguments.add_argument('--wiki_domain', help="Mediawiki login domain") +arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library )") arguments.add_argument('-v', '--verbose',help="Print verbose progress and error messages", action="store_true") arguments.add_argument('mediawiki', metavar='MEDIAWIKI_API_URL', help="URL of mediawiki's api.php file (something like http://mysite/wiki/api.php)") arguments.add_argument('dokuwiki', metavar='DOKUWIKI_ROOT', help="Root path to an existing dokuwiki installation to add the Mediawiki pages to (can be a brand new install.)") From 2e83a0bbd9195e147a0ccf1633ab999ac0329b77 Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Thu, 8 Sep 2016 15:57:35 +0200 Subject: [PATCH 04/10] Revert "pull all pages and not only the first 10" This reverts commit e517e12ca53d97e10922fa6f2469ad236f56b8be. --- mediawiki.py | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/mediawiki.py b/mediawiki.py index abc79ef..ad980b0 100644 --- a/mediawiki.py +++ b/mediawiki.py @@ -34,30 +34,14 @@ def verbose_print(self, msg): if self.verbose: print(msg) - def get_all_pages(self, limit=500, more=True): - # the mediawiki api right now limits the pulled pages to 10 - # the maximum allowed is 500 - # after the first pages are pulled there is no check if there are additional pages to load - # if more is set one additional pull is done that resumes at the last pulled entry - # and it stops if the last page is the last page ;) + def get_all_pages(self): """ Slurp all pages down from the mediawiki instance, together with all revisions including content. WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!! """ - count=1 - newest=0 - query = {'list' : 'allpages', 'aplimit':limit} - print("Getting list of pages 0-%i..." 
% limit) + query = {'list' : 'allpages'} + print("Getting list of pages...") pages = self._query(query, [ 'allpages' ]) - - while newest != pages[-1]['pageid'] and more: - print("Getting list of pages %i-%i..." % (count * limit, (count + 1) * limit)) - newest = pages[-1]['pageid'] - query = {'list': 'allpages', 'aplimit': 500, 'apfrom': pages[-1]['title']} - pages += self._query(query, ['allpages']) -# print("Newest is %s and the ''newest'' is %s" % (newest, pages[-1]['pageid'])) - count += 1 - self.verbose_print("Got %d pages." % len(pages)) print("Query page revisions (this may take a while)...") for page in pages: From b324c260fe97b54daacf9a8823b6544a811323da Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Mon, 12 Sep 2016 09:32:58 +0200 Subject: [PATCH 05/10] "corrected" my stupid mistake --- yamdwe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yamdwe.py b/yamdwe.py index 6ebc5cc..373a453 100755 --- a/yamdwe.py +++ b/yamdwe.py @@ -49,7 +49,7 @@ def main(): pages = importer.get_all_pages() print("Found %d pages to export..." 
% len(pages)) - # Add a shameless "exported by yamdwe" note to the front page of the wiki - really shameless, but I'll keep it + # Add a shameless "exported by yamdwe" note to the front page of the wiki mainpage = importer.get_main_pagetitle() for page in pages: @@ -85,7 +85,7 @@ def main(): arguments.add_argument('--http_pass', help="Password for HTTP basic auth (if --http_user is specified but not --http_pass, yamdwe will prompt for a password)") arguments.add_argument('--wiki_user', help="Mediawiki login username") arguments.add_argument('--wiki_pass', help="Mediawiki login password (if --wiki_user is specified but not --wiki_pass, yamdwe will prompt for a password)") -arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library )") +arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library)") arguments.add_argument('-v', '--verbose',help="Print verbose progress and error messages", action="store_true") arguments.add_argument('mediawiki', metavar='MEDIAWIKI_API_URL', help="URL of mediawiki's api.php file (something like http://mysite/wiki/api.php)") arguments.add_argument('dokuwiki', metavar='DOKUWIKI_ROOT', help="Root path to an existing dokuwiki installation to add the Mediawiki pages to (can be a brand new install.)") From aee6b427972ca793c2c3988d3077f4dfb4ad4a07 Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Wed, 5 Oct 2016 13:12:23 +0200 Subject: [PATCH 06/10] mentioned the domain functionality in the README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 86f0257..4bb3b1e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ To start an export, you will need the URL of the mediawiki API (usually http://m yamdwe.py MEDIAWIKI_API_URL DOKUWIKI_ROOT_PATH -If you need to log in to to your Mediawiki install (either with a Mediawiki username, or via HTTP Basic Auth) then run `yamdwe.py 
-h` to view the command line options for authentication. +If you need to log in to to your Mediawiki install (either with a Mediawiki username and if you are in a domain with the domain-name, or via HTTP Basic Auth) then run `yamdwe.py -h` to view the command line options for authentication. If installation goes well it should print the names of pages and images as it is exporting, and finally print "Done". This process can be slow, and can load up the Mediawiki server for large wikis. From 9bb119eabedb25898c8a4ac823d78a8a8d210bad Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Mon, 10 Oct 2016 12:11:16 +0200 Subject: [PATCH 07/10] make it not fail without a domain set --- mediawiki.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mediawiki.py b/mediawiki.py index ad980b0..6802e41 100644 --- a/mediawiki.py +++ b/mediawiki.py @@ -11,9 +11,12 @@ from pprint import pprint class Importer(object): - def __init__(self, api_url, http_user=None, http_pass="", wiki_user=None, wiki_pass="", wiki_domain="", verbose=False): + def __init__(self, api_url, http_user=None, http_pass="", wiki_user=None, wiki_pass="", wiki_domain=None, verbose=False): self.verbose = verbose - self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass, domain=wiki_domain) + if wiki_domain: + self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass, domain=wiki_domain) + else: + self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass) # login if necessary if wiki_user is not None: print("Logging in as %s..." 
% wiki_user) From 2eb8e921baffca76a443018af2f60ba068a528a0 Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Mon, 10 Oct 2016 12:24:18 +0200 Subject: [PATCH 08/10] simple check if simplemediawiki has the domain functionality --- yamdwe.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/yamdwe.py b/yamdwe.py index 373a453..b0f705a 100755 --- a/yamdwe.py +++ b/yamdwe.py @@ -15,6 +15,7 @@ import argparse, sys, codecs, locale, getpass, datetime from pprint import pprint import mediawiki, dokuwiki, wikicontent +import inspect def main(): # the wikicontent code (that uses visitor module) tends to recurse quite deeply for complex pages @@ -38,7 +39,10 @@ def main(): if not args.mediawiki.endswith("api.php"): print("WARNING: Mediawiki URL does not end in 'api.php'... This has to be the URL of the Mediawiki API, not just the wiki. If you can't export anything, try adding '/api.php' to the wiki URL.") - importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.wiki_domain, args.verbose) + if "domain" in inspect.getargspec(simplemediawiki.MediaWiki.__init__)[0]: + importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.wiki_domain, args.verbose) + else: + importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.verbose) exporter = dokuwiki.Exporter(args.dokuwiki) # Set the wikicontent's definition of File: and Image: prefixes (varies by language settings) @@ -85,7 +89,8 @@ def main(): arguments.add_argument('--http_pass', help="Password for HTTP basic auth (if --http_user is specified but not --http_pass, yamdwe will prompt for a password)") arguments.add_argument('--wiki_user', help="Mediawiki login username") arguments.add_argument('--wiki_pass', help="Mediawiki login password (if --wiki_user is specified but not --wiki_pass, yamdwe will prompt for a password)") 
-arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library)") +if "domain" in inspect.getargspec(simplemediawiki.MediaWiki.__init__)[0]: + arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library)") arguments.add_argument('-v', '--verbose',help="Print verbose progress and error messages", action="store_true") arguments.add_argument('mediawiki', metavar='MEDIAWIKI_API_URL', help="URL of mediawiki's api.php file (something like http://mysite/wiki/api.php)") arguments.add_argument('dokuwiki', metavar='DOKUWIKI_ROOT', help="Root path to an existing dokuwiki installation to add the Mediawiki pages to (can be a brand new install.)") From 69f29e413a47f2385bf33b8f9ebd168257c8ed45 Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Mon, 10 Oct 2016 12:25:27 +0200 Subject: [PATCH 09/10] added the smw import --- yamdwe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yamdwe.py b/yamdwe.py index b0f705a..67ee685 100755 --- a/yamdwe.py +++ b/yamdwe.py @@ -15,7 +15,8 @@ import argparse, sys, codecs, locale, getpass, datetime from pprint import pprint import mediawiki, dokuwiki, wikicontent -import inspect +# only needed to check for domain functionality +import simplemediawiki, inspect def main(): # the wikicontent code (that uses visitor module) tends to recurse quite deeply for complex pages From 4c1ebe27fee0cb4e343abcf4b48dcfcc57e1df4b Mon Sep 17 00:00:00 2001 From: Thomas Schneider Date: Mon, 10 Oct 2016 12:30:41 +0200 Subject: [PATCH 10/10] Updated README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 4bb3b1e..3930b6d 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,10 @@ To start an export, you will need the URL of the mediawiki API (usually http://m If you need to log in to to your Mediawiki install (either with a Mediawiki username and if you are in a domain with the domain-name, or 
via HTTP Basic Auth) then run `yamdwe.py -h` to view the command line options for authentication. +Domain functionality is added through the "develop" branch of this [simplemediawiki fork](https://github.com/BlackLotus/python-simplemediawiki/tree/develop) and can be used as follows: + + yamdwe.py --wiki_domain WIKI_DOMAIN MEDIAWIKI_API_URL DOKUWIKI_ROOT_PATH + If installation goes well it should print the names of pages and images as it is exporting, and finally print "Done". This process can be slow, and can load up the Mediawiki server for large wikis. Yamdwe may warn you at the end that it is unable to set [correct permissions for the Dokuwiki data directories and files](https://www.dokuwiki.org/install:permissions) - regardless, you should check and correct these manually.