From 27d548c5449453cc97db1ed8f5f1dac95d349623 Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Thu, 21 Oct 2010 23:48:12 +0800 Subject: [PATCH 01/20] linked other rdfapi --- dashboard.html | 1 + 1 file changed, 1 insertion(+) diff --git a/dashboard.html b/dashboard.html index 6ba7cfb..b2db549 100644 --- a/dashboard.html +++ b/dashboard.html @@ -27,6 +27,7 @@ - their API is 'evolving...' - I've copied the entire tabulator/ tree here for now; hopefully we need just a few files. - The js needed here comes from : hg clone http://dig.csail.mit.edu/hg/tabulator + - future, might use this http://webr3.org/apps/play/api/foaf.html */ var http = require("http"); From 21933a2d33faab781cdbe77868b3252cac769c23 Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 00:41:42 +0800 Subject: [PATCH 02/20] commented out some logging, and display current fetching url in status bar --- dashboard.html | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/dashboard.html b/dashboard.html index b2db549..cc670ce 100644 --- a/dashboard.html +++ b/dashboard.html @@ -37,6 +37,7 @@ var div = document.createElement('div'); div.innerHTML = s; document.getElementById("output").appendChild(div); + hold(10); // woah there cowboy... (calm down a bit, let browser have a life) } @@ -84,7 +85,7 @@ function ingest(kb,uri) { - print("Ingesting new kb of length " + kb.statements.length + " from " +uri); +// print("Ingesting new kb of length " + kb.statements.length + " from " +uri); update_counter('docs', 1); update_counter('triples', kb.statements.length); @@ -96,16 +97,20 @@ } var elsewhere = seeAlsos(kb); var mugs = images(kb); - print("New places to look: "+elsewhere); - print("Images: "+mugs); - // scan results - var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined ); + //print("New places to look: "+elsewhere); + //print("Images: "+mugs); + + + /* +var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined ); for (var i = 0; i < results.length; i++) { li = document.createElement('li'); li.textContent = results[i].object; // print("Got name: "+li.textContent); document.getElementById('objects').appendChild(li); } + + */ // crawl docs found here: cutil.waitforAll(crawl, elsewhere); @@ -169,7 +174,7 @@ print('not crawling '+uri+' again'); return; } - print('crawling '+uri); +// print('crawling '+uri); try { var data = fetch(uri); } @@ -177,7 +182,7 @@ print ("Error fetching data from "+uri+": "+e); return; } - print ("Got data from " + uri); +// print ("Got data from " + uri); var kb = new $rdf.IndexedFormula(); var p = new $rdf.RDFParser(kb); try { @@ -193,7 +198,8 @@ // To make this super robust, let's limit number of concurrent http // connections to 10: var fetch = cutil.makeBoundedFunction(function(uri) { - print('fetching '+uri); +// print('fetching '+uri); + document.getElementById('url').innerHTML = uri; return http.xml(["util/p.php", {u:uri}]); }, 10); @@ -217,13 +223,16 @@ +
docs: factoids: people: sources: images: -
+| +url: +

From 0d367346bf9a0bc0016cfc6c4a1d7c68d63d6f0d Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 00:48:38 +0800 Subject: [PATCH 03/20] updateStatus(s) working --- dashboard.html | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/dashboard.html b/dashboard.html index cc670ce..1a5a0f9 100644 --- a/dashboard.html +++ b/dashboard.html @@ -40,6 +40,11 @@ hold(10); // woah there cowboy... (calm down a bit, let browser have a life) } +function updateStatus(s) { + var div = document.createElement('div'); + document.getElementById("status").innerHTML = s; + hold(10); +} var counters = { docs: 0, people: 0, triples: 0, sources: 0, images: 0 }; var seen = { uri: {} /* could expand this to remove duplicates in people,images, etc: people: {}, images: {} */ }; @@ -169,12 +174,13 @@ // document.getElementById("dashboard").innerHTML == counter; } + function crawl(uri) { if (have_seen('uri', uri)) { - print('not crawling '+uri+' again'); + updateStatus('not crawling '+uri+' again'); return; } -// print('crawling '+uri); + updateStatus('crawling '+uri); try { var data = fetch(uri); } @@ -182,7 +188,7 @@ print ("Error fetching data from "+uri+": "+e); return; } -// print ("Got data from " + uri); + updateStatus("Got data from " + uri); var kb = new $rdf.IndexedFormula(); var p = new $rdf.RDFParser(kb); try { @@ -198,7 +204,7 @@ // To make this super robust, let's limit number of concurrent http // connections to 10: var fetch = cutil.makeBoundedFunction(function(uri) { -// print('fetching '+uri); + updateStatus('fetching '+uri); document.getElementById('url').innerHTML = uri; return http.xml(["util/p.php", {u:uri}]); }, 10); @@ -231,7 +237,8 @@ sources: images: | -url: +url:
+status:

From aac466af0933af9f44175e9a15f997b6e9e32b9a Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 01:14:15 +0800 Subject: [PATCH 04/20] 100 concurrent now, not 10. Added some timing code from Alex. --- dashboard.html | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/dashboard.html b/dashboard.html index 1a5a0f9..c357564 100644 --- a/dashboard.html +++ b/dashboard.html @@ -33,6 +33,14 @@ var http = require("http"); var cutil = require('cutil'); + + + +var aggregateTime = 0, requestCount = 0; // for timings + + + + function print(s) { var div = document.createElement('div'); div.innerHTML = s; @@ -206,8 +214,15 @@ var fetch = cutil.makeBoundedFunction(function(uri) { updateStatus('fetching '+uri); document.getElementById('url').innerHTML = uri; - return http.xml(["util/p.php", {u:uri}]); -}, 10); + + var start = new Date(); + var rv = http.xml(["util/p.php", {u:uri}]); + aggregateTime += (new Date() - start); + ++requestCount; + document.getElementById("av-req-duration").innerHTML = aggregateTime/requestCount; + return rv; + // return http.xml(["util/p.php", {u:uri}]); +}, 100); // storage @@ -239,6 +254,7 @@ | url:
status: +time:

From bcccdcbbe2ec6721a782077864fca4ffbe03deff Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 01:31:59 +0800 Subject: [PATCH 05/20] floating menu w/ css --- dashboard.html | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/dashboard.html b/dashboard.html index c357564..f1e3fe4 100644 --- a/dashboard.html +++ b/dashboard.html @@ -209,8 +209,7 @@ ingest(kb,uri); } -// To make this super robust, let's limit number of concurrent http -// connections to 10: +// To make this super robust, let's limit number of concurrent http connections to 10: var fetch = cutil.makeBoundedFunction(function(uri) { updateStatus('fetching '+uri); document.getElementById('url').innerHTML = uri; @@ -224,11 +223,7 @@ // return http.xml(["util/p.php", {u:uri}]); }, 100); - -// storage -// http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 - - +// storage http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 print("Initializing..."); waitfor { setup(); @@ -240,22 +235,26 @@ print("All done"); + - - -
-docs: -factoids: -people: -sources: -images: -| -url:
-status: -time: + +
+ +
+ docs: + factoids: + people: + queue: + images:
+ url:
+ status: + time:
+


@@ -263,9 +262,5 @@
-

People in a FOAF File

-
    -
- From e535c31e0f871c02b985e011896f316a60bc5342 Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 01:53:49 +0800 Subject: [PATCH 06/20] factoidal --- dashboard.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dashboard.html b/dashboard.html index f1e3fe4..3b6f206 100644 --- a/dashboard.html +++ b/dashboard.html @@ -49,7 +49,6 @@ } function updateStatus(s) { - var div = document.createElement('div'); document.getElementById("status").innerHTML = s; hold(10); } @@ -221,7 +220,7 @@ document.getElementById("av-req-duration").innerHTML = aggregateTime/requestCount; return rv; // return http.xml(["util/p.php", {u:uri}]); -}, 100); +}, 10); // storage http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 print("Initializing..."); @@ -231,6 +230,7 @@ or { require('dom').waitforEvent('stop', 'click'); print("Cancelled"); + updateStatus("Cancelled. Game over..."); } print("All done"); From aba67114abf58c555206b02aeb24f88b5cc934cd Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 02:24:51 +0800 Subject: [PATCH 07/20] sort of limiting images to a gallery of 10. whole app seems to hang a lot > --- dashboard.html | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/dashboard.html b/dashboard.html index 3b6f206..6356973 100644 --- a/dashboard.html +++ b/dashboard.html @@ -170,11 +170,22 @@ } function showpic(u) { - var p = document.getElementById("pic"); + var rand_no = Math.random(); + rand_no = rand_no * 10; + rand_no = Math.ceil(rand_no); var img = document.createElement('img'); + var d = document.getElementById('g'+rand_no); + print ("assigning image "+u+ " to gallery slot g"+rand_no); img.src= u; - img.width=100; - p.appendChild(img); + img.height=150; + + if (d.firstChild != null) { + print("replacing image for "+rand_no); + d.replaceChild(d.firstChild, img); + } else { + print("creating first image for "+rand_no); + d.appendChild(img); + } } function updateDashboard() { @@ -256,8 +267,32 @@

-

+
+
+
+
+
+
+
+ + + + +
+

From bd9d96de3a3d3ead20ea577d024d25e975317059 Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 02:33:14 +0800 Subject: [PATCH 08/20] oopsie, try/catch works better. --- dashboard.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dashboard.html b/dashboard.html index 6356973..7452eb1 100644 --- a/dashboard.html +++ b/dashboard.html @@ -179,10 +179,10 @@ img.src= u; img.height=150; - if (d.firstChild != null) { +try { print("replacing image for "+rand_no); d.replaceChild(d.firstChild, img); - } else { + } catch(e) { print("creating first image for "+rand_no); d.appendChild(img); } From 53b0cd9128e323e1d33b3d66ccc87f274470450c Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 02:35:20 +0800 Subject: [PATCH 09/20] replaceChild args were reversed --- dashboard.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard.html b/dashboard.html index 7452eb1..abb8afc 100644 --- a/dashboard.html +++ b/dashboard.html @@ -181,7 +181,7 @@ try { print("replacing image for "+rand_no); - d.replaceChild(d.firstChild, img); + d.replaceChild(img, d.firstChild); } catch(e) { print("creating first image for "+rand_no); d.appendChild(img); From 742500e01eb0c53b232949723f417d22d15d691f Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 02:39:44 +0800 Subject: [PATCH 10/20] less noisy output --- dashboard.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dashboard.html b/dashboard.html index abb8afc..8e66f34 100644 --- a/dashboard.html +++ b/dashboard.html @@ -175,15 +175,15 @@ rand_no = Math.ceil(rand_no); var img = document.createElement('img'); var d = document.getElementById('g'+rand_no); - print ("assigning image "+u+ " to gallery slot g"+rand_no); + // print ("assigning image "+u+ " to gallery slot g"+rand_no); img.src= u; img.height=150; try { - print("replacing image for "+rand_no); + // print("replacing image for "+rand_no); d.replaceChild(img, d.firstChild); } catch(e) { - print("creating first image for "+rand_no); + // print("creating first image for "+rand_no); d.appendChild(img); } } From dc541a53d97958958c6a81554407f92aa4dbb2bd Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Fri, 22 Oct 2010 22:14:50 +0800 Subject: [PATCH 11/20] added timeout of 8s for urls (thanks alex, again) --- dashboard.html | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dashboard.html b/dashboard.html index 8e66f34..9557d5e 100644 --- a/dashboard.html +++ b/dashboard.html @@ -225,7 +225,15 @@ document.getElementById('url').innerHTML = uri; var start = new Date(); - var rv = http.xml(["util/p.php", {u:uri}]); + + waitfor { + var rv = http.xml(["util/p.php", {u:uri}]); + } + or { + hold(8000); // 8s timeout + throw("timeout for url "+uri); + } + aggregateTime += (new Date() - start); ++requestCount; document.getElementById("av-req-duration").innerHTML = aggregateTime/requestCount; From ead3e1e6236111313ad647e4ab4f613eca404ce6 Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Sat, 23 Oct 2010 00:39:01 +0800 Subject: [PATCH 12/20] minimised textual output. hid the poorly-named sources counter. --- dashboard.html | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/dashboard.html b/dashboard.html index 9557d5e..4634e6a 100644 --- a/dashboard.html +++ b/dashboard.html @@ -38,7 +38,16 @@ var aggregateTime = 0, requestCount = 0; // for timings - +var skip_list = [ 'kwark.org', // wonky data + 'asemantics.com', // missing sites + 'impressive.net', // http auth (need to test handling) + 'lagrange.net', // ditto + 'geocities.com', // RIP + 'picdiary.com', // awol + 'localhost', // + '127.0.0.1', // + 'kwark.yi.org', // + ]; function print(s) { @@ -73,7 +82,7 @@ // turtle/n3 'http://dig.csail.mit.edu/People/kennyluck#I' - print("setting waitForAll on plan of "+plan); + updateStatus("setting waitForAll on plan of "+plan); cutil.waitforAll(crawl, plan); } @@ -92,6 +101,22 @@ return true; // seen before } + + + +function matches_skip_list(u) { + if (!u) { return; } + for (var i = 0; i < skip_list.length; i++) { + // print("checking "+u+" against "+skip_list[i]); + if ( u.indexOf( skip_list[i]) != -1) + { + // print("on skiplist: "+u +" ~ "+skip_list[i]); + return true; + } + } + return false; +} + // see also $rdf.IndexedFormula in tabulator/chrome/content/js/rdf/rdflib.js @@ -136,7 +161,7 @@ for (var i = 0; i < results.length; i++) { //print("result: sub: "+ results[i].subject.uri + " pred: " + results[i].predicate.uri + " obj: " + results[i].object.uri ); r.push(results[i].object.uri); - update_counter('sources',1); // assumes each is new + // update_counter('sources',1); // assumes each is new } return(r); } @@ -198,12 +223,19 @@ updateStatus('not crawling '+uri+' again'); return; } + + + if (matches_skip_list(uri)) { + updateStatus('not crawling '+uri+' - on the skip_list'); + return; + } + updateStatus('crawling '+uri); try { var data = fetch(uri); } catch(e) { - print ("Error fetching data from "+uri+": "+e); + updateStatus("Error fetching data from "+uri+": "+e); return; } updateStatus("Got data from " + uri); @@ -213,7 +245,7 @@ p.parse(data, uri, uri); } catch (e) { - print ("Parse error on "+uri+": "+e); + updateStatus("Parse error on "+uri+": "+e); return; } ingest(kb,uri); @@ -242,7 +274,7 @@ }, 10); // storage http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 -print("Initializing..."); +updateStatus("Initializing..."); waitfor { setup(); } @@ -255,7 +287,7 @@ @@ -267,7 +299,7 @@ docs: factoids: people: - queue: + images:
url:
status: From d8bab70f479c8b4afe7c4d7d381c3878886a6ffc Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Sat, 23 Oct 2010 00:43:02 +0800 Subject: [PATCH 13/20] skiplist for images too. --- dashboard.html | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dashboard.html b/dashboard.html index 4634e6a..47a6208 100644 --- a/dashboard.html +++ b/dashboard.html @@ -40,8 +40,8 @@ var skip_list = [ 'kwark.org', // wonky data 'asemantics.com', // missing sites - 'impressive.net', // http auth (need to test handling) - 'lagrange.net', // ditto + 'impressive.net', // http auth (need to test handling); on an image not foaf though + 'lag-range.net', // ditto 'geocities.com', // RIP 'picdiary.com', // awol 'localhost', // @@ -195,6 +195,9 @@ } function showpic(u) { + + if (matches_skip_list(u) { return; } // no missing photos, http etc if we happen to know better. + var rand_no = Math.random(); rand_no = rand_no * 10; rand_no = Math.ceil(rand_no); From 49474488d83f2914299c520f6c1b57f4958e13cc Mon Sep 17 00:00:00 2001 From: Dan Brickley Date: Sat, 23 Oct 2010 04:50:10 +0800 Subject: [PATCH 14/20] google sgapi todo --- dashboard.html | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dashboard.html b/dashboard.html index 47a6208..82088e0 100644 --- a/dashboard.html +++ b/dashboard.html @@ -6,6 +6,9 @@ // TODO: // * RDFa parser, http://code.google.com/p/rdfquery/source/browse/trunk/jquery.rdfa.js // * be a real crawler, see also view-source:http://83.84.224.61/workbench/crawler.html +// * add Google SGAPI lookups, eg. +// http://socialgraph.apis.google.com/lookup?q=http://danbri.org/foaf.rdf&fme=1&edi=1&edo=1&pretty=1&callback= + /* Note: this is written using http://stratifiedjs.org/ - an extension of Javascript @@ -196,7 +199,7 @@ function showpic(u) { - if (matches_skip_list(u) { return; } // no missing photos, http etc if we happen to know better. + if (matches_skip_list(u)) { return; } // no missing photos, http etc if we happen to know better. var rand_no = Math.random(); rand_no = rand_no * 10; From 7e9d5ff3390a9355f6323fcd7b5f0d1cf2b2a784 Mon Sep 17 00:00:00 2001 From: afri Date: Mon, 25 Oct 2010 19:14:55 +0200 Subject: [PATCH 15/20] agent based crawler --- agent_crawler.html | 170 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 agent_crawler.html diff --git a/agent_crawler.html b/agent_crawler.html new file mode 100644 index 0000000..ec3522b --- /dev/null +++ b/agent_crawler.html @@ -0,0 +1,170 @@ + +StrataScutter + + + + +
Queued docs:
+
Processed docs:
+
Processed triples:
+
Av. Fanout:
+
Av. Request duration:
+
Av. Processing duration:
+
Errors:
+
Timeouts:
+ From 3a648f86d6f10607c0d2f1ffb98f92d9d09bf906 Mon Sep 17 00:00:00 2001 From: afri Date: Mon, 25 Oct 2010 19:33:47 +0200 Subject: [PATCH 16/20] Improve queue filling logic (remove unnecessary hold()). --- agent_crawler.html | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/agent_crawler.html b/agent_crawler.html index ec3522b..f0427c7 100644 --- a/agent_crawler.html +++ b/agent_crawler.html @@ -61,18 +61,15 @@ // helper to add uris to the crawl queue: function fill_queue(uris) { increment('fanout', uris.length); + if (crawl_queue.count() >= MAX_PENDING_DOCS) { + //c.log("Too many uris lined up for crawling"); + return; + } for (var i=0,uri; uri=uris[i]; ++i) if (!seen[uri]) { seen[uri] = true; - waitfor { - crawl_queue.put(uri); - increment('queued'); - } - or { - hold(1000); - c.log("Too many uris lined up for crawling"); - return; - } + crawl_queue.put(uri); + increment('queued'); } } From fdc8ac067e4d9767ce6dc541d0d5e52726a9b898 Mon Sep 17 00:00:00 2001 From: afri Date: Tue, 26 Oct 2010 08:35:41 +0200 Subject: [PATCH 17/20] improve queue capping logic. --- agent_crawler.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agent_crawler.html b/agent_crawler.html index f0427c7..3a7e505 100644 --- a/agent_crawler.html +++ b/agent_crawler.html @@ -61,13 +61,13 @@ // helper to add uris to the crawl queue: function fill_queue(uris) { increment('fanout', uris.length); - if (crawl_queue.count() >= MAX_PENDING_DOCS) { - //c.log("Too many uris lined up for crawling"); - return; - } for (var i=0,uri; uri=uris[i]; ++i) if (!seen[uri]) { seen[uri] = true; + if (crawl_queue.count() >= MAX_PENDING_DOCS) { + //c.log("Too many uris lined up for crawling"); + return; + } crawl_queue.put(uri); increment('queued'); } From 86db75fe4787bfbe0c2f6216a85dbce8705e7cad Mon Sep 17 00:00:00 2001 From: afri Date: Tue, 26 Oct 2010 11:40:52 +0200 Subject: [PATCH 18/20] more work on agent_crawler. --- agent_crawler.html | 172 ++++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 58 deletions(-) diff --git a/agent_crawler.html b/agent_crawler.html index 3a7e505..20424e6 100644 --- a/agent_crawler.html +++ b/agent_crawler.html @@ -1,7 +1,10 @@ -StrataScutter - + + StrataScutter + - +
Queued docs:
Processed docs:
Processed triples:
Av. Fanout:
+
Av. Doc Size (kB):
Av. Request duration:
Av. Processing duration:
Errors:
Timeouts:
+
+ From 6c4d806f2ad4c81ab2fbc48cc24943fbebc9996a Mon Sep 17 00:00:00 2001 From: afri Date: Tue, 26 Oct 2010 11:42:10 +0200 Subject: [PATCH 19/20] Fix oni-apollo path. --- agent_crawler.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent_crawler.html b/agent_crawler.html index 20424e6..af75ed6 100644 --- a/agent_crawler.html +++ b/agent_crawler.html @@ -1,7 +1,7 @@ - + StrataScutter