diff --git a/agent_crawler.html b/agent_crawler.html new file mode 100644 index 0000000..b82793a --- /dev/null +++ b/agent_crawler.html @@ -0,0 +1,223 @@ + + + + + + StrataScutter + + + + +
Queued docs:
+
Processed docs:
+
Processed triples:
+
Av. Fanout:
+
Av. Doc Size (kB):
+
Av. Request duration:
+
Av. Processing duration:
+
Errors:
+
Timeouts:
+
+ + diff --git a/dashboard.html b/dashboard.html index 6ba7cfb..82088e0 100644 --- a/dashboard.html +++ b/dashboard.html @@ -6,6 +6,9 @@ // TODO: // * RDFa parser, http://code.google.com/p/rdfquery/source/browse/trunk/jquery.rdfa.js // * be a real crawler, see also view-source:http://83.84.224.61/workbench/crawler.html +// * add Google SGAPI lookups, eg. +// http://socialgraph.apis.google.com/lookup?q=http://danbri.org/foaf.rdf&fme=1&edi=1&edo=1&pretty=1&callback= + /* Note: this is written using http://stratifiedjs.org/ - an extension of Javascript @@ -27,17 +30,40 @@ - their API is 'evolving...' - I've copied the entire tabulator/ tree here for now; hopefully we need just a few files. - The js needed here comes from : hg clone http://dig.csail.mit.edu/hg/tabulator + - future, might use this http://webr3.org/apps/play/api/foaf.html */ var http = require("http"); var cutil = require('cutil'); + + + +var aggregateTime = 0, requestCount = 0; // for timings + +var skip_list = [ 'kwark.org', // wonky data + 'asemantics.com', // missing sites + 'impressive.net', // http auth (need to test handling); on an image not foaf though + 'lag-range.net', // ditto + 'geocities.com', // RIP + 'picdiary.com', // awol + 'localhost', // + '127.0.0.1', // + 'kwark.yi.org', // + ]; + + function print(s) { var div = document.createElement('div'); div.innerHTML = s; document.getElementById("output").appendChild(div); + hold(10); // woah there cowboy... (calm down a bit, let browser have a life) } +function updateStatus(s) { + document.getElementById("status").innerHTML = s; + hold(10); +} var counters = { docs: 0, people: 0, triples: 0, sources: 0, images: 0 }; var seen = { uri: {} /* could expand this to remove duplicates in people,images, etc: people: {}, images: {} */ }; @@ -59,7 +85,7 @@ // turtle/n3 'http://dig.csail.mit.edu/People/kennyluck#I' - print("setting waitForAll on plan of "+plan); + updateStatus("setting waitForAll on plan of "+plan); cutil.waitforAll(crawl, plan); } @@ -78,12 +104,28 @@ return true; // seen before } + + + +function matches_skip_list(u) { + if (!u) { return; } + for (var i = 0; i < skip_list.length; i++) { + // print("checking "+u+" against "+skip_list[i]); + if ( u.indexOf( skip_list[i]) != -1) + { + // print("on skiplist: "+u +" ~ "+skip_list[i]); + return true; + } + } + return false; +} + // see also $rdf.IndexedFormula in tabulator/chrome/content/js/rdf/rdflib.js function ingest(kb,uri) { - print("Ingesting new kb of length " + kb.statements.length + " from " +uri); +// print("Ingesting new kb of length " + kb.statements.length + " from " +uri); update_counter('docs', 1); update_counter('triples', kb.statements.length); @@ -95,16 +137,20 @@ } var elsewhere = seeAlsos(kb); var mugs = images(kb); - print("New places to look: "+elsewhere); - print("Images: "+mugs); - // scan results - var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined ); + //print("New places to look: "+elsewhere); + //print("Images: "+mugs); + + + /* +var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined ); for (var i = 0; i < results.length; i++) { li = document.createElement('li'); li.textContent = results[i].object; // print("Got name: "+li.textContent); document.getElementById('objects').appendChild(li); } + + */ // crawl docs found here: cutil.waitforAll(crawl, elsewhere); @@ -118,7 +164,7 @@ for (var i = 0; i < results.length; i++) { //print("result: sub: "+ results[i].subject.uri + " pred: " + results[i].predicate.uri + " obj: " + results[i].object.uri ); r.push(results[i].object.uri); - update_counter('sources',1); // assumes each is new + // update_counter('sources',1); // assumes each is new } return(r); } @@ -152,87 +198,150 @@ } function showpic(u) { - var p = document.getElementById("pic"); + + if (matches_skip_list(u)) { return; } // no missing photos, http etc if we happen to know better. + + var rand_no = Math.random(); + rand_no = rand_no * 10; + rand_no = Math.ceil(rand_no); var img = document.createElement('img'); + var d = document.getElementById('g'+rand_no); + // print ("assigning image "+u+ " to gallery slot g"+rand_no); img.src= u; - img.width=100; - p.appendChild(img); + img.height=150; + +try { + // print("replacing image for "+rand_no); + d.replaceChild(img, d.firstChild); + } catch(e) { + // print("creating first image for "+rand_no); + d.appendChild(img); + } } function updateDashboard() { // document.getElementById("dashboard").innerHTML == counter; } + function crawl(uri) { if (have_seen('uri', uri)) { - print('not crawling '+uri+' again'); + updateStatus('not crawling '+uri+' again'); return; } - print('crawling '+uri); + + + if (matches_skip_list(uri)) { + updateStatus('not crawling '+uri+' - on the skip_list'); + return; + } + + updateStatus('crawling '+uri); try { var data = fetch(uri); } catch(e) { - print ("Error fetching data from "+uri+": "+e); + updateStatus("Error fetching data from "+uri+": "+e); return; } - print ("Got data from " + uri); + updateStatus("Got data from " + uri); var kb = new $rdf.IndexedFormula(); var p = new $rdf.RDFParser(kb); try { p.parse(data, uri, uri); } catch (e) { - print ("Parse error on "+uri+": "+e); + updateStatus("Parse error on "+uri+": "+e); return; } ingest(kb,uri); } -// To make this super robust, let's limit number of concurrent http -// connections to 10: +// To make this super robust, let's limit number of concurrent http connections to 10: var fetch = cutil.makeBoundedFunction(function(uri) { - print('fetching '+uri); - return http.xml(["util/p.php", {u:uri}]); -}, 10); + updateStatus('fetching '+uri); + document.getElementById('url').innerHTML = uri; + var start = new Date(); -// storage -// http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 + waitfor { + var rv = http.xml(["util/p.php", {u:uri}]); + } + or { + hold(8000); // 8s timeout + throw("timeout for url "+uri); + } + aggregateTime += (new Date() - start); + ++requestCount; + document.getElementById("av-req-duration").innerHTML = aggregateTime/requestCount; + return rv; + // return http.xml(["util/p.php", {u:uri}]); +}, 10); -print("Initializing..."); +// storage http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3 +updateStatus("Initializing..."); waitfor { setup(); } or { require('dom').waitforEvent('stop', 'click'); print("Cancelled"); + updateStatus("Cancelled. Game over..."); } print("All done"); + - -
-docs: -factoids: -people: -sources: -images: + +
+ +
+ docs: + factoids: + people: + + images:
+ url:
+ status: + time: +

-

+
+
+
+
+
+
+
+ + -
+
+

-

People in a FOAF File

- + +