diff --git a/agent_crawler.html b/agent_crawler.html
new file mode 100644
index 0000000..b82793a
--- /dev/null
+++ b/agent_crawler.html
@@ -0,0 +1,223 @@
+
+
+
+
+
+ StrataScutter
+
+
+
+
+ Queued docs:
+ Processed docs:
+ Processed triples:
+ Av. Fanout:
+ Av. Doc Size (kB):
+ Av. Request duration:
+ Av. Processing duration:
+ Errors:
+ Timeouts:
+
+
+ Agents:
+
+
+
+
diff --git a/dashboard.html b/dashboard.html
index 6ba7cfb..82088e0 100644
--- a/dashboard.html
+++ b/dashboard.html
@@ -6,6 +6,9 @@
// TODO:
// * RDFa parser, http://code.google.com/p/rdfquery/source/browse/trunk/jquery.rdfa.js
// * be a real crawler, see also view-source:http://83.84.224.61/workbench/crawler.html
+// * add Google SGAPI lookups, eg.
+// http://socialgraph.apis.google.com/lookup?q=http://danbri.org/foaf.rdf&fme=1&edi=1&edo=1&pretty=1&callback=
+
/*
Note: this is written using http://stratifiedjs.org/ - an extension of Javascript
@@ -27,17 +30,40 @@
- their API is 'evolving...'
- I've copied the entire tabulator/ tree here for now; hopefully we need just a few files.
- The js needed here comes from : hg clone http://dig.csail.mit.edu/hg/tabulator
+ - future, might use this http://webr3.org/apps/play/api/foaf.html
*/
var http = require("http");
var cutil = require('cutil');
+
+
+
+// Running totals used by fetch() to display the average request duration:
+// the summed duration of completed requests and the number of such requests.
+var aggregateTime = 0, requestCount = 0; // for timings
+
+// Fragments of URIs that should be neither crawled nor shown as images.
+// matches_skip_list() tests each entry with indexOf against the whole URI,
+// so an entry matches anywhere in the URI, not only in the host name.
+var skip_list = [ 'kwark.org', // wonky data
+ 'asemantics.com', // missing sites
+ 'impressive.net', // http auth (need to test handling); on an image not foaf though
+ 'lag-range.net', // ditto
+ 'geocities.com', // RIP
+ 'picdiary.com', // awol
+ 'localhost', //
+ '127.0.0.1', //
+ 'kwark.yi.org', //
+ ];
+
+
+// Append s as a new <div> at the end of the "output" element, then pause briefly.
+// s is assigned to innerHTML, so any markup it contains is parsed as HTML.
function print(s) {
var div = document.createElement('div');
div.innerHTML = s;
document.getElementById("output").appendChild(div);
+ hold(10); // woah there cowboy... (calm down a bit, let browser have a life)
}
+// Show s as the current status message, replacing the previous one, then
+// pause briefly so the browser gets a chance to update the page.
+function updateStatus(s) {
+ // Callers build s from URIs found in crawled documents and from error text,
+ // so write it as plain text instead of letting the browser parse it as HTML.
+ document.getElementById("status").textContent = s;
+ hold(10);
+}
var counters = { docs: 0, people: 0, triples: 0, sources: 0, images: 0 };
var seen = { uri: {} /* could expand this to remove duplicates in people,images, etc: people: {}, images: {} */ };
@@ -59,7 +85,7 @@
// turtle/n3 'http://dig.csail.mit.edu/People/kennyluck#I'
- print("setting waitForAll on plan of "+plan);
+ updateStatus("setting waitForAll on plan of "+plan);
cutil.waitforAll(crawl, plan);
}
@@ -78,12 +104,28 @@
return true; // seen before
}
+
+
+
+// Return true when the URI string u contains any skip_list entry as a
+// substring; return false otherwise, including when u is empty or missing.
+function matches_skip_list(u) {
+ if (!u) { return false; } // answer with a boolean on every path
+ for (var i = 0; i < skip_list.length; i++) {
+ if (u.indexOf(skip_list[i]) !== -1) {
+ return true;
+ }
+ }
+ return false;
+}
+
// see also $rdf.IndexedFormula in tabulator/chrome/content/js/rdf/rdflib.js
function ingest(kb,uri) {
- print("Ingesting new kb of length " + kb.statements.length + " from " +uri);
+// print("Ingesting new kb of length " + kb.statements.length + " from " +uri);
update_counter('docs', 1);
update_counter('triples', kb.statements.length);
@@ -95,16 +137,20 @@
}
var elsewhere = seeAlsos(kb);
var mugs = images(kb);
- print("New places to look: "+elsewhere);
- print("Images: "+mugs);
- // scan results
- var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined );
+ //print("New places to look: "+elsewhere);
+ //print("Images: "+mugs);
+
+
+ /*
+var results = kb.statementsMatching(undefined, new $rdf.Symbol('http://xmlns.com/foaf/0.1/name'), undefined );
for (var i = 0; i < results.length; i++) {
li = document.createElement('li');
li.textContent = results[i].object;
// print("Got name: "+li.textContent);
document.getElementById('objects').appendChild(li);
}
+
+ */
// crawl docs found here:
cutil.waitforAll(crawl, elsewhere);
@@ -118,7 +164,7 @@
for (var i = 0; i < results.length; i++) {
//print("result: sub: "+ results[i].subject.uri + " pred: " + results[i].predicate.uri + " obj: " + results[i].object.uri );
r.push(results[i].object.uri);
- update_counter('sources',1); // assumes each is new
+ // update_counter('sources',1); // assumes each is new
}
return(r);
}
@@ -152,87 +198,150 @@
}
+// Display the image at URL u in a randomly chosen gallery slot (an element
+// whose id is "g1" .. "g10"), replacing whatever the slot's first child was.
+// URLs on the skip list are ignored, and nothing happens if the slot element
+// is not present in the page.
function showpic(u) {
- var p = document.getElementById("pic");
+
+ if (matches_skip_list(u)) { return; } // no missing photos, http etc if we happen to know better.
+
+ // Math.random() is in [0, 1), so floor(x * 10) + 1 is always 1..10.
+ // (Math.ceil(x * 10) would give 0 when Math.random() returns exactly 0.)
+ var rand_no = Math.floor(Math.random() * 10) + 1;
var img = document.createElement('img');
+ var d = document.getElementById('g'+rand_no);
+ if (!d) { return; } // slot missing from the page: nothing to attach the image to
img.src= u;
- img.width=100;
- p.appendChild(img);
+ img.height=150;
+
+ // Test for an existing child explicitly instead of relying on replaceChild
+ // throwing when the slot is still empty.
+ if (d.firstChild) {
+ d.replaceChild(img, d.firstChild);
+ } else {
+ d.appendChild(img);
+ }
}
+// Placeholder: currently does nothing (its only statement is commented out).
function updateDashboard() {
// document.getElementById("dashboard").innerHTML == counter;
}
+
+// Crawl a single document. URIs that have_seen() reports as already seen, or
+// that match the skip list, are not fetched. Otherwise the document is
+// fetched, parsed with $rdf.RDFParser into a fresh $rdf.IndexedFormula and
+// handed to ingest(). A fetch or parse failure is reported on the status line
+// and abandons this URI only; nothing is rethrown to the caller.
function crawl(uri) {
if (have_seen('uri', uri)) {
- print('not crawling '+uri+' again');
+ updateStatus('not crawling '+uri+' again');
return;
}
- print('crawling '+uri);
+
+
+ if (matches_skip_list(uri)) {
+ updateStatus('not crawling '+uri+' - on the skip_list');
+ return;
+ }
+
+ updateStatus('crawling '+uri);
+ // 'data' is declared with var, so it stays visible after the try block.
try {
var data = fetch(uri);
}
catch(e) {
- print ("Error fetching data from "+uri+": "+e);
+ updateStatus("Error fetching data from "+uri+": "+e);
return;
}
- print ("Got data from " + uri);
+ updateStatus("Got data from " + uri);
var kb = new $rdf.IndexedFormula();
var p = new $rdf.RDFParser(kb);
try {
p.parse(data, uri, uri);
}
catch (e) {
- print ("Parse error on "+uri+": "+e);
+ updateStatus("Parse error on "+uri+": "+e);
return;
}
ingest(kb,uri);
}
-// To make this super robust, let's limit number of concurrent http
-// connections to 10:
+// To make this super robust, let's limit number of concurrent http connections to 10:
+// fetch(uri) requests uri via util/p.php and returns whatever http.xml returns.
+// If no response arrives within 8 seconds it throws the string
+// "timeout for url <uri>". It also shows the uri being fetched and keeps the
+// displayed average request duration up to date.
var fetch = cutil.makeBoundedFunction(function(uri) {
- print('fetching '+uri);
- return http.xml(["util/p.php", {u:uri}]);
-}, 10);
+ updateStatus('fetching '+uri);
+ // uri can originate from a crawled document, so show it as plain text
+ // rather than having the browser parse it as HTML.
+ document.getElementById('url').textContent = uri;
+ var start = new Date();
-// storage
-// http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3
+ // Race the request against a timer; whichever branch finishes first wins.
+ waitfor {
+ var rv = http.xml(["util/p.php", {u:uri}]);
+ }
+ or {
+ hold(8000); // 8s timeout
+ throw("timeout for url "+uri);
+ }
+ // Only reached when the request completed, so timed-out requests are not
+ // included in the average.
+ aggregateTime += (new Date() - start);
+ ++requestCount;
+ document.getElementById("av-req-duration").innerHTML = aggregateTime/requestCount;
+ return rv;
+}, 10);
-print("Initializing...");
+// storage http://www.rajdeepd.com/articles/chrome/localstrg/LocalStorageSample.htm#section3
+updateStatus("Initializing...");
+// Run setup() until it completes or until the "stop" element is clicked,
+// whichever happens first.
waitfor {
setup();
}
or {
require('dom').waitforEvent('stop', 'click');
print("Cancelled");
+ updateStatus("Cancelled. Game over...");
}
+// NOTE(review): this line runs after either branch, so "All done" is also
+// printed following a cancellation - confirm that is intended.
print("All done");
+
-
-
-docs:
-factoids:
-people:
-sources:
-images:
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
People in a FOAF File
-
+
+