diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..99fe3d9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+berkeley-entity-models.tgz
+data.tgz
+data/
+expers/
+models/
+project/project/
+project/target/
+target/
+specify_execDir/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..61b103e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+# convenience targets for building and running the pipeline
+
+TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar
+
+all: $(TARGET)
+
+# make's $(wildcard) does not recurse into subdirectories, so use find instead
+$(TARGET): $(shell find src -type f)
+	sbt assembly
+
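+# run the ACE'05 tester over the CoNLL-format data under data/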
+aceTester: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.ACETester -dataPath data/ace05/ace05-all-conll
+
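+# run the query chooser against the serialized ACE wiki DB in models/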
+queryModel: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz
+
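+# preprocess the raw ACL 2011 Wikification texts (WIKILIMITED mode)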
+wikiLimited: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.preprocess.PreprocessingDriver ++config/base.conf -inputDir ../WikificationACL2011Data/WikipediaSample/RawTextsTrain/ -outputDir /tmp/gggg/raw/ -mode WIKILIMITED
diff --git a/build.sbt b/build.sbt
index 91a4b9b..77738fd 100644
--- a/build.sbt
+++ b/build.sbt
@@ -4,9 +4,11 @@ name := "berkeley-entity"
version := "1"
-scalaVersion := "2.11.2"
+scalaVersion := "2.11.6"
assemblySettings
mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver")
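+// ship resources/ (Messages_*.properties, interwiki.properties, operators.txt) inside the jar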
+unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" }
diff --git a/resources/Messages_de.properties b/resources/Messages_de.properties
new file mode 100644
index 0000000..51b38e9
--- /dev/null
+++ b/resources/Messages_de.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Inhaltsverzeichnis
+wiki.api.url=http://de.wikipedia.org/w/api.php
+wiki.api.category1=Kategorie
+wiki.api.image1=Datei
+wiki.api.template1=Vorlage
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_en.properties b/resources/Messages_en.properties
new file mode 100644
index 0000000..6b9e2f0
--- /dev/null
+++ b/resources/Messages_en.properties
@@ -0,0 +1,37 @@
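+# each namespace has two accepted aliases, keyed with suffixes 1 and 2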
+wiki.tags.toc.content=Contents
+wiki.api.url=http://en.wikipedia.org/w/api.php
+wiki.api.media1=Media
+wiki.api.media2=Media
+wiki.api.special1=Special
+wiki.api.special2=Special
+wiki.api.talk1=Talk
+wiki.api.talk2=Talk
+wiki.api.user1=User
+wiki.api.user2=User
+wiki.api.usertalk1=User_talk
+wiki.api.usertalk2=User_talk
+wiki.api.meta1=Meta
+wiki.api.meta2=Meta
+wiki.api.metatalk1=Meta_talk
+wiki.api.metatalk2=Meta_talk
+wiki.api.image1=Image
+wiki.api.image2=File
+wiki.api.imagetalk1=Image_talk
+wiki.api.imagetalk2=File_talk
+wiki.api.mediawiki1=MediaWiki
+wiki.api.mediawiki2=MediaWiki
+wiki.api.mediawikitalk1=MediaWiki_talk
+wiki.api.mediawikitalk2=MediaWiki_talk
+wiki.api.template1=Template
+wiki.api.template2=Template
+wiki.api.templatetalk1=Template_talk
+wiki.api.templatetalk2=Template_talk
+wiki.api.help1=Help
+wiki.api.help2=Help
+wiki.api.helptalk1=Help_talk
+wiki.api.helptalk2=Help_talk
+wiki.api.category1=Category
+wiki.api.category2=Category
+wiki.api.categorytalk1=Category_talk
+wiki.api.categorytalk2=Category_talk
\ No newline at end of file
diff --git a/resources/Messages_es.properties b/resources/Messages_es.properties
new file mode 100644
index 0000000..bc50428
--- /dev/null
+++ b/resources/Messages_es.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Contenido
+wiki.api.url=http://es.wikipedia.org/w/api.php
+wiki.api.category1=Categor\u00EDa
+wiki.api.image1=Imagen
+wiki.api.template1=Plantilla
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
diff --git a/resources/Messages_fr.properties b/resources/Messages_fr.properties
new file mode 100644
index 0000000..2a76842
--- /dev/null
+++ b/resources/Messages_fr.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Sommaire
+wiki.api.url=http://fr.wikipedia.org/w/api.php
+wiki.api.category1=Cat\u00E9gorie
+wiki.api.image1=Image
+wiki.api.template1=Mod\u00E8le
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_it.properties b/resources/Messages_it.properties
new file mode 100644
index 0000000..97778a3
--- /dev/null
+++ b/resources/Messages_it.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Indice
+wiki.api.url=http://it.wikipedia.org/w/api.php
+wiki.api.category1=Categoria
+wiki.api.image1=Immagine
+wiki.api.template1=Template
+wiki.api.category2=Category
+wiki.api.image2=File
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_pt_BR.properties b/resources/Messages_pt_BR.properties
new file mode 100644
index 0000000..e0baaf7
--- /dev/null
+++ b/resources/Messages_pt_BR.properties
@@ -0,0 +1,38 @@
+#Generated by ResourceBundle Editor (http://eclipse-rbe.sourceforge.net)
+
+wiki.api.category1 = Categoria
+wiki.api.category2 = Categoria
+wiki.api.categorytalk1 = Categoria_Discuss\u00E3o
+wiki.api.categorytalk2 = Categoria_Discuss\u00E3o
+wiki.api.help1 = Ajuda
+wiki.api.help2 = Ajuda
+wiki.api.helptalk1 = Ajuda_Discuss\u00E3o
+wiki.api.helptalk2 = Ajuda_Discuss\u00E3o
+wiki.api.image1 = Imagem
+wiki.api.image2 = Arquivo
+wiki.api.imagetalk1 = Imagem_Discuss\u00E3o
+wiki.api.imagetalk2 = Arquivo_Discuss\u00E3o
+wiki.api.media1 = M\u00EDdia
+wiki.api.media2 = M\u00EDdia
+wiki.api.mediawiki1 = MediaWiki
+wiki.api.mediawiki2 = MediaWiki
+wiki.api.mediawikitalk1 = MediaWiki_Discuss\u00E3o
+wiki.api.mediawikitalk2 = MediaWiki_Discuss\u00E3o
+wiki.api.meta1 = Meta
+wiki.api.meta2 = Meta
+wiki.api.metatalk1 = Meta_Discuss\u00E3o
+wiki.api.metatalk2 = Meta_Discuss\u00E3o
+wiki.api.special1 = Especial
+wiki.api.special2 = Especial
+wiki.api.talk1 = Discuss\u00E3o
+wiki.api.talk2 = Discuss\u00E3o
+wiki.api.template1 = Predefini\u00E7\u00E3o
+wiki.api.template2 = Predefini\u00E7\u00E3o
+wiki.api.templatetalk1 = Predefini\u00E7\u00E3o_Discuss\u00E3o
+wiki.api.templatetalk2 = Predefini\u00E7\u00E3o_Discuss\u00E3o
+wiki.api.url = http://pt.wikipedia.org/w/api.php
+wiki.api.user1 = Usu\u00E1rio
+wiki.api.user2 = Usu\u00E1rio
+wiki.api.usertalk1 = Usu\u00E1rio_Discuss\u00E3o
+wiki.api.usertalk2 = Usu\u00E1rio_Discuss\u00E3o
+wiki.tags.toc.content = Conte\u00FAdo
diff --git a/resources/interwiki.properties b/resources/interwiki.properties
new file mode 100644
index 0000000..b312b1e
--- /dev/null
+++ b/resources/interwiki.properties
@@ -0,0 +1,393 @@
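+# interwiki prefix -> URL template; ${title} is replaced with the target page title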
+be-x-old=http://be-x-old.wikipedia.org/wiki/${title}
+tavi=http://tavi.sourceforge.net/${title}
+xh=http://xh.wikipedia.org/wiki/${title}
+lasvegaswiki=http://wiki.gmnow.com/index.php/${title}
+pmeg=http://www.bertilow.com/pmeg/${title}.php
+warpedview=http://www.warpedview.com/index.php/${title}
+slashdot=http://slashdot.org/article.pl?sid=${title}
+wikimedia=http://wikimediafoundation.org/wiki/${title}
+wikia=http://www.wikia.com/wiki/index.php/${title}
+wo=http://wo.wikipedia.org/wiki/${title}
+jefo=http://www.esperanto-jeunes.org/vikio/index.php?${title}
+openfacts=http://openfacts.berlios.de/index.phtml?title=${title}
+lqwiki=http://wiki.linuxquestions.org/wiki/${title}
+wa=http://wa.wikipedia.org/wiki/${title}
+ciscavate=http://ciscavate.org/index.php/${title}
+demokraatia=http://wiki.demokraatia.ee/
+efnetpythonwiki=http://purl.net/wiki/python/${title}
+mediazilla=http://bugzilla.wikipedia.org/${title}
+wikiquote=http://en.wikiquote.org/wiki/${title}
+jbo=http://jbo.wikipedia.org/wiki/${title}
+vo=http://vo.wikipedia.org/wiki/${title}
+vi=http://vi.wikipedia.org/wiki/${title}
+gamewiki=http://gamewiki.org/wiki/index.php/${title}
+hewikisource=http://he.wikisource.org/wiki/${title}
+ve=http://ve.wikipedia.org/wiki/${title}
+google=http://www.google.com/search?q=${title}
+uz=http://uz.wikipedia.org/wiki/${title}
+drumcorpswiki=http://www.drumcorpswiki.com/index.php/${title}
+nah=http://nah.wikipedia.org/wiki/${title}
+ur=http://ur.wikipedia.org/wiki/${title}
+jiniwiki=http://www.cdegroot.com/cgi-bin/jini?${title}
+uk=http://uk.wikipedia.org/wiki/${title}
+ug=http://ug.wikipedia.org/wiki/${title}
+osi\ reference\ model=http://wiki.tigma.ee/
+mbtest=http://www.usemod.com/cgi-bin/mbtest.pl?${title}
+disinfopedia=http://www.disinfopedia.org/wiki.phtml?title=${title}
+ty=http://ty.wikipedia.org/wiki/${title}
+squeak=http://minnow.cc.gatech.edu/squeak/${title}
+tw=http://tw.wikipedia.org/wiki/${title}
+tlh=http://tlh.wikipedia.org/wiki/${title}
+tt=http://tt.wikipedia.org/wiki/${title}
+ts=http://ts.wikipedia.org/wiki/${title}
+tr=http://tr.wikipedia.org/wiki/${title}
+scoutpedia=http://www.scoutpedia.info/index.php/${title}
+minnan=http://zh-min-nan.wikipedia.org/wiki/${title}
+to=http://to.wikipedia.org/wiki/${title}
+tn=http://tn.wikipedia.org/wiki/${title}
+wikinfo=http://www.wikinfo.org/wiki.php?title=${title}
+s23wiki=http://is-root.de/wiki/index.php/${title}
+tl=http://tl.wikipedia.org/wiki/${title}
+aiwiki=http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?${title}
+tk=http://tk.wikipedia.org/wiki/${title}
+ti=http://ti.wikipedia.org/wiki/${title}
+th=http://th.wikipedia.org/wiki/${title}
+tg=http://tg.wikipedia.org/wiki/${title}
+fr.fr=http://fr.fr.wikinations.org/${title}
+te=http://te.wikipedia.org/wiki/${title}
+csb=http://csb.wikipedia.org/wiki/${title}
+theopedia=http://www.theopedia.com/${title}
+ta=http://ta.wikipedia.org/wiki/${title}
+acadwiki=http://xarch.tu-graz.ac.at/autocad/wiki/${title}
+efnetceewiki=http://purl.net/wiki/c/${title}
+phpwiki=http://phpwiki.sourceforge.net/phpwiki/index.php?${title}
+tmwiki=http://www.EasyTopicMaps.com/?page=${title}
+sw=http://sw.wikipedia.org/wiki/${title}
+benefitswiki=http://www.benefitslink.com/cgi-bin/wiki.cgi?${title}
+ecxei=http://www.ikso.net/cgi-bin/wiki.pl?${title}
+sv=http://sv.wikipedia.org/wiki/${title}
+uea=http://www.tejo.org/uea/${title}
+su=http://su.wikipedia.org/wiki/${title}
+st=http://st.wikipedia.org/wiki/${title}
+ss=http://ss.wikipedia.org/wiki/${title}
+sr=http://sr.wikipedia.org/wiki/${title}
+sq=http://sq.wikipedia.org/wiki/${title}
+so=http://so.wikipedia.org/wiki/${title}
+sn=http://sn.wikipedia.org/wiki/${title}
+sm=http://sm.wikipedia.org/wiki/${title}
+sl=http://sl.wikipedia.org/wiki/${title}
+sk=http://sk.wikipedia.org/wiki/${title}
+cache=http://www.google.com/search?q=cache:${title}
+svgwiki=http://www.protocol7.com/svg-wiki/default.asp?${title}
+si=http://si.wikipedia.org/wiki/${title}
+smikipedia=http://www.smikipedia.org/${title}
+simple=http://simple.wikipedia.org/wiki/${title}
+sh=http://sh.wikipedia.org/wiki/${title}
+sg=http://sg.wikipedia.org/wiki/${title}
+gentoo-wiki=http://gentoo-wiki.com/${title}
+se=http://se.wikipedia.org/wiki/${title}
+webseitzwiki=http://webseitz.fluxent.com/wiki/${title}
+sd=http://sd.wikipedia.org/wiki/${title}
+sc=http://sc.wikipedia.org/wiki/${title}
+jamwiki=http://jamwiki.org/wiki/en/${title}
+sa=http://sa.wikipedia.org/wiki/${title}
+greencheese=http://www.greencheese.org/${title}
+linuxwiki=http://www.linuxwiki.de/${title}
+diveintoosx=http://diveintoosx.org/${title}
+bridgeswiki=http://c2.com/w2/bridges/${title}
+rw=http://rw.wikipedia.org/wiki/${title}
+ru=http://ru.wikipedia.org/wiki/${title}
+corpknowpedia=http://corpknowpedia.org/wiki/index.php/${title}
+echei=http://www.ikso.net/cgi-bin/wiki.pl?${title}
+ro=http://ro.wikipedia.org/wiki/${title}
+rn=http://rn.wikipedia.org/wiki/${title}
+rm=http://rm.wikipedia.org/wiki/${title}
+wikispecies=http://species.wikipedia.org/wiki/${title}
+webdevwikinl=http://www.promo-it.nl/WebDevWiki/index.php?page=${title}
+sourceforge=http://sourceforge.net/${title}
+pythonwiki=http://www.pythonwiki.de/${title}
+roa-rup=http://roa-rup.wikipedia.org/wiki/${title}
+tmnet=http://www.technomanifestos.net/?${title}
+gmailwiki=http://www.gmailwiki.com/index.php/${title}
+plog4u=http://plog4u.org/index.php/${title}
+googlegroups=http://groups.google.com/groups?q=${title}
+wikiworld=http://WikiWorld.com/wiki/index.php/${title}
+qu=http://qu.wikipedia.org/wiki/${title}
+consciousness=http://teadvus.inspiral.org/
+eljwiki=http://elj.sourceforge.net/phpwiki/index.php/${title}
+lojban=http://www.lojban.org/tiki/tiki-index.php?page=${title}
+usej=http://www.tejo.org/usej/${title}
+tokipona=http://tokipona.wikipedia.org/wiki/${title}
+mathsongswiki=http://SeedWiki.com/page.cfm?wikiid=237&doc=${title}
+got=http://got.wikipedia.org/wiki/${title}
+shakti=http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/${title}
+memoryalpha=http://www.memory-alpha.org/en/index.php/${title}
+cliki=http://ww.telent.net/cliki/${title}
+pt=http://pt.wikipedia.org/wiki/${title}
+fr.ca=http://fr.ca.wikinations.org/${title}
+ps=http://ps.wikipedia.org/wiki/${title}
+fur=http://fur.wikipedia.org/wiki/${title}
+wikicities=http://www.wikicities.com/index.php/${title}
+pl=http://pl.wikipedia.org/wiki/${title}
+pi=http://pi.wikipedia.org/wiki/${title}
+wiktionary=http://en.wiktionary.org/wiki/${title}
+turismo=http://www.tejo.org/turismo/${title}
+pa=http://pa.wikipedia.org/wiki/${title}
+terrorwiki=http://www.liberalsagainstterrorism.com/wiki/index.php/${title}
+finalempire=http://final-empire.sourceforge.net/cgi-bin/wiki.pl?${title}
+fr.be=http://fr.wikinations.be/${title}
+os=http://os.wikipedia.org/wiki/${title}
+or=http://or.wikipedia.org/wiki/${title}
+netvillage=http://www.netbros.com/?${title}
+seattlewireless=http://seattlewireless.net/?${title}
+om=http://om.wikipedia.org/wiki/${title}
+pangalacticorg=http://www.pangalactic.org/Wiki/${title}
+seeds=http://www.IslandSeeds.org/wiki/${title}
+oc=http://oc.wikipedia.org/wiki/${title}
+raec=http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/${title}
+ny=http://ny.wikipedia.org/wiki/${title}
+nv=http://nv.wikipedia.org/wiki/${title}
+foldoc=http://www.foldoc.org/foldoc/foldoc.cgi?${title}
+no=http://no.wikipedia.org/wiki/${title}
+nn=http://nn.wikipedia.org/wiki/${title}
+metawikipedia=http://meta.wikimedia.org/wiki/${title}
+wikif1=http://www.wikif1.org/${title}
+nl=http://nl.wikipedia.org/wiki/${title}
+ypsieyeball=http://sknkwrks.dyndns.org:1957/writewiki/wiki.pl?${title}
+ng=http://ng.wikipedia.org/wiki/${title}
+purlnet=http://purl.oclc.org/NET/${title}
+ne=http://ne.wikipedia.org/wiki/${title}
+nb=http://nb.wikipedia.org/wiki/${title}
+abbenormal=http://www.ourpla.net/cgi-bin/pikie.cgi?${title}
+na=http://na.wikipedia.org/wiki/${title}
+docbook=http://docbook.org/wiki/moin.cgi/${title}
+fr.org=http://fr.wikinations.org/${title}
+my=http://my.wikipedia.org/wiki/${title}
+brasilwiki=http://rio.ifi.unizh.ch/brasilienwiki/index.php/${title}
+mt=http://mt.wikipedia.org/wiki/${title}
+ms=http://ms.wikipedia.org/wiki/${title}
+mr=http://mr.wikipedia.org/wiki/${title}
+advogato=http://www.advogato.org/${title}
+senseislibrary=http://senseis.xmp.net/?${title}
+mo=http://mo.wikipedia.org/wiki/${title}
+mn=http://mn.wikipedia.org/wiki/${title}
+lutherwiki=http://www.lutheranarchives.com/mw/index.php/${title}
+ml=http://ml.wikipedia.org/wiki/${title}
+mk=http://mk.wikipedia.org/wiki/${title}
+mi=http://mi.wikipedia.org/wiki/${title}
+jspwiki=http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=${title}
+mh=http://mh.wikipedia.org/wiki/${title}
+mg=http://mg.wikipedia.org/wiki/${title}
+metaweb=http://www.metaweb.com/wiki/wiki.phtml?title=${title}
+kmwiki=http://www.voght.com/cgi-bin/pywiki?${title}
+efnetxmlwiki=http://purl.net/wiki/xml/${title}
+tejo=http://www.tejo.org/vikio/${title}
+zwiki=http://www.zwiki.org/${title}
+lv=http://lv.wikipedia.org/wiki/${title}
+lt=http://lt.wikipedia.org/wiki/${title}
+lo=http://lo.wikipedia.org/wiki/${title}
+foxwiki=http://fox.wikis.com/wc.dll?Wiki~${title}
+ln=http://ln.wikipedia.org/wiki/${title}
+emacswiki=http://www.emacswiki.org/cgi-bin/wiki.pl?${title}
+li=http://li.wikipedia.org/wiki/${title}
+bemi=http://bemi.free.fr/vikio/index.php?${title}
+lg=http://lg.wikipedia.org/wiki/${title}
+wikibooks=http://en.wikibooks.org/wiki/${title}
+lb=http://lb.wikipedia.org/wiki/${title}
+la=http://la.wikipedia.org/wiki/${title}
+creationmatters=http://www.ourpla.net/cgi-bin/wiki.pl?${title}
+ky=http://ky.wikipedia.org/wiki/${title}
+kw=http://kw.wikipedia.org/wiki/${title}
+kv=http://kv.wikipedia.org/wiki/${title}
+pikie=http://pikie.darktech.org/cgi/pikie?${title}
+evowiki=http://www.evowiki.org/index.php/${title}
+ku=http://ku.wikipedia.org/wiki/${title}
+ks=http://ks.wikipedia.org/wiki/${title}
+kr=http://kr.wikipedia.org/wiki/${title}
+haribeau=http://wiki.haribeau.de/cgi-bin/wiki.pl?${title}
+ko=http://ko.wikipedia.org/wiki/${title}
+kn=http://kn.wikipedia.org/wiki/${title}
+km=http://km.wikipedia.org/wiki/${title}
+kl=http://kl.wikipedia.org/wiki/${title}
+kk=http://kk.wikipedia.org/wiki/${title}
+kj=http://kj.wikipedia.org/wiki/${title}
+ki=http://ki.wikipedia.org/wiki/${title}
+why=http://clublet.com/c/c/why?${title}
+kg=http://kg.wikipedia.org/wiki/${title}
+ka=http://ka.wikipedia.org/wiki/${title}
+mus=http://mus.wikipedia.org/wiki/${title}
+hrwiki=http://www.hrwiki.org/index.php/${title}
+orgpatterns=http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?${title}
+jv=http://jv.wikipedia.org/wiki/${title}
+gotamac=http://www.got-a-mac.org/${title}
+dolphinwiki=http://www.object-arts.com/wiki/html/Dolphin/${title}
+zh-cn=http://zh.wikipedia.org/wiki/${title}
+visualworks=http://wiki.cs.uiuc.edu/VisualWorks/${title}
+iawiki=http://www.IAwiki.net/${title}
+freebsdman=http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=${title}
+ja=http://ja.wikipedia.org/wiki/${title}
+chy=http://chy.wikipedia.org/wiki/${title}
+unreal=http://wiki.beyondunreal.com/wiki/${title}
+iu=http://iu.wikipedia.org/wiki/${title}
+it=http://it.wikipedia.org/wiki/${title}
+is=http://is.wikipedia.org/wiki/${title}
+chr=http://chr.wikipedia.org/wiki/${title}
+usemod=http://www.usemod.com/cgi-bin/wiki.pl?${title}
+cmwiki=http://www.ourpla.net/cgi-bin/wiki.pl?${title}
+hammondwiki=http://www.dairiki.org/HammondWiki/index.php3?${title}
+cho=http://cho.wikipedia.org/wiki/${title}
+io=http://io.wikipedia.org/wiki/${title}
+personaltelco=http://www.personaltelco.net/index.cgi/${title}
+ik=http://ik.wikipedia.org/wiki/${title}
+haw=http://haw.wikipedia.org/wiki/${title}
+ii=http://ii.wikipedia.org/wiki/${title}
+wikisource=http://sources.wikipedia.org/wiki/${title}
+lugkr=http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?${title}
+ig=http://ig.wikipedia.org/wiki/${title}
+zh-cfr=http://zh-min-nan.wikipedia.org/wiki/${title}
+ie=http://ie.wikipedia.org/wiki/${title}
+id=http://id.wikipedia.org/wiki/${title}
+ia=http://ia.wikipedia.org/wiki/${title}
+openwiki=http://openwiki.com/?${title}
+hz=http://hz.wikipedia.org/wiki/${title}
+hy=http://hy.wikipedia.org/wiki/${title}
+strikiwiki=http://ch.twi.tudelft.nl/~mostert/striki/teststriki.pl?${title}
+hu=http://hu.wikipedia.org/wiki/${title}
+herzkinderwiki=http://www.herzkinderinfo.de/Mediawiki/index.php/${title}
+ht=http://ht.wikipedia.org/wiki/${title}
+hr=http://hr.wikipedia.org/wiki/${title}
+webisodes=http://www.webisodes.org/${title}
+globalvoices=http://cyber.law.harvard.edu/dyn/globalvoices/wiki/${title}
+ho=http://ho.wikipedia.org/wiki/${title}
+hi=http://hi.wikipedia.org/wiki/${title}
+elibre=http://enciclopedia.us.es/index.php/${title}
+alife=http://news.alife.org/wiki/index.php?${title}
+he=http://he.wikipedia.org/wiki/${title}
+ast=http://ast.wikipedia.org/wiki/${title}
+ha=http://ha.wikipedia.org/wiki/${title}
+revo=http://purl.org/NET/voko/revo/art/${title}.html
+arxiv=http://www.arxiv.org/abs/${title}
+sockwiki=http://wiki.socklabs.com/${title}
+gv=http://gv.wikipedia.org/wiki/${title}
+gu=http://gu.wikipedia.org/wiki/${title}
+gn=http://gn.wikipedia.org/wiki/${title}
+gl=http://gl.wikipedia.org/wiki/${title}
+seapig=http://www.seapig.org/${title}
+gd=http://gd.wikipedia.org/wiki/${title}
+ga=http://ga.wikipedia.org/wiki/${title}
+opera7wiki=http://nontroppo.org/wiki/${title}
+oeis=http://www.research.att.com/cgi-bin/access.cgi/as/njas/sequences/eisA.cgi?Anum=${title}
+moinmoin=http://purl.net/wiki/moin/${title}
+fy=http://fy.wikipedia.org/wiki/${title}
+gej=http://www.esperanto.de/cgi-bin/aktivikio/wiki.pl?${title}
+fr=http://fr.wikipedia.org/wiki/${title}
+arc=http://arc.wikipedia.org/wiki/${title}
+fo=http://fo.wikipedia.org/wiki/${title}
+fj=http://fj.wikipedia.org/wiki/${title}
+wikinews=http://en.wikinews.org/wiki/${title}
+fi=http://fi.wikipedia.org/wiki/${title}
+ff=http://ff.wikipedia.org/wiki/${title}
+annotationwiki=http://www.seedwiki.com/page.cfm?wikiid=368&doc=${title}
+sep11=http://sep11.wikipedia.org/wiki/${title}
+wlug=http://www.wlug.org.nz/${title}
+fa=http://fa.wikipedia.org/wiki/${title}
+eu=http://eu.wikipedia.org/wiki/${title}
+tmbw=http://www.tmbw.net/wiki/index.php/${title}
+et=http://et.wikipedia.org/wiki/${title}
+scn=http://scn.wikipedia.org/wiki/${title}
+es=http://es.wikipedia.org/wiki/${title}
+muweb=http://www.dunstable.com/scripts/MuWebWeb?${title}
+eo=http://eo.wikipedia.org/wiki/${title}
+en=http://en.wikipedia.org/wiki/${title}
+dejanews=http://www.deja.com/=dnc/getdoc.xp?AN=${title}
+el=http://el.wikipedia.org/wiki/${title}
+jargonfile=http://sunir.org/apps/meta.pl?wiki=JargonFile&redirect=${title}
+eokulturcentro=http://esperanto.toulouse.free.fr/wakka.php?wiki=${title}
+ee=http://ee.wikipedia.org/wiki/${title}
+tum=http://tum.wikipedia.org/wiki/${title}
+plog4u_de=http://plog4u.de/index.php/${title}
+dz=http://dz.wikipedia.org/wiki/${title}
+dv=http://dv.wikipedia.org/wiki/${title}
+kerimwiki=http://wiki.oxus.net/${title}
+dk=http://da.wikipedia.org/wiki/${title}
+de=http://de.wikipedia.org/wiki/${title}
+dwjwiki=http://www.suberic.net/cgi-bin/dwj/wiki.cgi?${title}
+da=http://da.wikipedia.org/wiki/${title}
+wlwiki=http://winslowslair.supremepixels.net/wiki/index.php/${title}
+cy=http://cy.wikipedia.org/wiki/${title}
+w=http://en.wikipedia.org/wiki/${title}
+cv=http://cv.wikipedia.org/wiki/${title}
+cs=http://cs.wikipedia.org/wiki/${title}
+cr=http://cr.wikipedia.org/wiki/${title}
+q=http://en.wikiquote.org/wiki/${title}
+co=http://co.wikipedia.org/wiki/${title}
+zh-min-nan=http://zh-min-nan.wikipedia.org/wiki/${title}
+n=http://en.wikinews.org/wiki/${title}
+m=http://meta.wikimedia.org/wiki/${title}
+annotation=http://bayle.stanford.edu/crit/nph-med.cgi/${title}
+ch=http://ch.wikipedia.org/wiki/${title}
+efnetcppwiki=http://purl.net/wiki/cpp/${title}
+ce=http://ce.wikipedia.org/wiki/${title}
+c2find=http://c2.com/cgi/wiki?FindPage&value=${title}
+b=http://en.wikibooks.org/wiki/${title}
+ca=http://ca.wikipedia.org/wiki/${title}
+dictionary=http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=${title}
+ang=http://ang.wikipedia.org/wiki/${title}
+zh-tw=http://zh.wikipedia.org/wiki/${title}
+bs=http://bs.wikipedia.org/wiki/${title}
+br=http://br.wikipedia.org/wiki/${title}
+twiki=http://twiki.org/cgi-bin/view/${title}
+bo=http://bo.wikipedia.org/wiki/${title}
+wikt=http://en.wiktionary.org/wiki/${title}
+bn=http://bn.wikipedia.org/wiki/${title}
+bm=http://bm.wikipedia.org/wiki/${title}
+bi=http://bi.wikipedia.org/wiki/${title}
+bh=http://bh.wikipedia.org/wiki/${title}
+bg=http://bg.wikipedia.org/wiki/${title}
+knowhow=http://www2.iro.umontreal.ca/~paquetse/cgi-bin/wiki.cgi?${title}
+be=http://be.wikipedia.org/wiki/${title}
+wiki=http://c2.com/cgi/wiki?${title}
+patwiki=http://gauss.ffii.org/${title}
+ba=http://ba.wikipedia.org/wiki/${title}
+rfc=http://www.rfc-editor.org/rfc/rfc${title}.txt
+zu=http://zu.wikipedia.org/wiki/${title}
+lanifexwiki=http://opt.lanifex.com/cgi-bin/wiki.pl?${title}
+twistedwiki=http://purl.net/wiki/twisted/${title}
+az=http://az.wikipedia.org/wiki/${title}
+ay=http://ay.wikipedia.org/wiki/${title}
+commons=http://commons.wikimedia.org/wiki/${title}
+acronym=http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=${title}
+av=http://av.wikipedia.org/wiki/${title}
+aspienetwiki=http://aspie.mela.de/Wiki/index.php?title=${title}
+as=http://as.wikipedia.org/wiki/${title}
+metawiki=http://sunir.org/apps/meta.pl?${title}
+ar=http://ar.wikipedia.org/wiki/${title}
+zh=http://zh.wikipedia.org/wiki/${title}
+pywiki=http://www.voght.com/cgi-bin/pywiki?${title}
+an=http://an.wikipedia.org/wiki/${title}
+am=http://am.wikipedia.org/wiki/${title}
+ak=http://ak.wikipedia.org/wiki/${title}
+infosecpedia=http://www.infosecpedia.org/pedia/index.php/${title}
+za=http://za.wikipedia.org/wiki/${title}
+af=http://af.wikipedia.org/wiki/${title}
+firstwiki=http://firstwiki.org/index.php/${title}
+als=http://als.wikipedia.org/wiki/${title}
+ab=http://ab.wikipedia.org/wiki/${title}
+aa=http://aa.wikipedia.org/wiki/${title}
+ursine=http://ursine.ca/${title}
+meatball=http://www.usemod.com/cgi-bin/mb.pl?${title}
+mozillawiki=http://wiki.mozilla.org/index.php/${title}
+imdb=http://us.imdb.com/Title?${title}
+pythoninfo=http://www.python.org/cgi-bin/moinmoin/${title}
+yo=http://yo.wikipedia.org/wiki/${title}
+seattlewiki=http://seattlewiki.org/wiki/${title}
+yi=http://yi.wikipedia.org/wiki/${title}
+vls=http://vls.wikipedia.org/wiki/${title}
+meta=http://meta.wikimedia.org/wiki/${title}
+susning=http://www.susning.nu/${title}
+nds=http://nds.wikipedia.org/wiki/${title}
+wikitravel=http://wikitravel.org/en/${title}
+codersbase=http://www.codersbase.com/${title}
+tpi=http://tpi.wikipedia.org/wiki/${title}
+ppr=http://c2.com/cgi/wiki?${title}
\ No newline at end of file
diff --git a/resources/operators.txt b/resources/operators.txt
new file mode 100644
index 0000000..7d9835d
--- /dev/null
+++ b/resources/operators.txt
@@ -0,0 +1,27 @@
+pre,-,PreMinus,4600
+pre,+,PrePlus,4600
+pre,not,Not,4600
+#
+in,^,Pow,3700
+#
+in,*,Times,3800
+in,/,Divide,3800
+in,div,Divide,3800
+in,mod,Mod,3800
+#
+in,+,Plus,2900
+in,-,Subtract,2900
+#
+in,round,Round,2800
+#
+in,=,Equal,2600
+in,!=,Unequal,2600
+in,<>,Unequal,2600
+in,>,Greater,2600
+in,>=,GreaterEqual,2600
+in,<,Less,2600
+in,<=,LessEqual,2600
+#
+in,and,And,2000
+#
+in,or,Or,1900
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
index d29aaa0..b4012e9 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
@@ -17,13 +17,13 @@ case class ConllDoc(val docID: String,
val trees: Seq[DepConstTree],
val nerChunks: Seq[Seq[Chunk[String]]],
val corefChunks: Seq[Seq[Chunk[Int]]],
- val speakers: Seq[Seq[String]]) {
+ val speakers: Seq[Seq[String]]) extends Document {
- val numSents = words.size;
+ override val numSents = words.size;
- def uid = docID -> docPartNo;
+ override def uid = docID -> docPartNo;
- def fileName = {
+ override def fileName = {
if (docID.contains("/")) {
docID.substring(docID.lastIndexOf("/") + 1);
} else {
@@ -31,11 +31,11 @@ case class ConllDoc(val docID: String,
}
}
- def printableDocName = docID + " (part " + docPartNo + ")";
+ override def printableDocName = docID + " (part " + docPartNo + ")";
- def isConversation = docID.startsWith("bc") || docID.startsWith("wb");
-
- def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx);
+ override def isConversation = docID.startsWith("bc") || docID.startsWith("wb")
+
+ override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx);
}
object ConllDoc {
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
index 91685f3..299fe02 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
@@ -38,7 +38,7 @@ class ConllDocReader(val lang: Language,
case _ => throw new RuntimeException("Bad language, no head finder for " + lang);
}
- def readConllDocs(fileName: String): Seq[ConllDoc] = {
+ def readConllDocs(fileName: String): Seq[Document] = {
val fcn = (docID: String, docPartNo: Int, docBySentencesByLines: ArrayBuffer[ArrayBuffer[String]]) => assembleConllDoc(docBySentencesByLines, docID, docPartNo);
ConllDocReader.readConllDocsGeneral(fileName, fcn);
}
@@ -283,7 +283,7 @@ object ConllDocReader {
// loadRawConllDocsWithSuffix(path, size, if (gold) "gold_conll" else "auto_conll", lang, betterParsesFile);
// }
- def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[ConllDoc] = {
+ def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[Document] = {
Logger.logss("Loading " + size + " docs from " + path + " ending with " + suffix);
val rawDir = new File(path);
if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) {
@@ -292,13 +292,23 @@
val rawFiles = rawDir.listFiles.sortBy(_.getAbsolutePath());
val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix));
val reader = new ConllDocReader(lang, betterParsesFile);
- val docs = new ArrayBuffer[ConllDoc];
+ val docs = new ArrayBuffer[Document];
var docCounter = 0;
var fileIdx = 0;
while (fileIdx < files.size && (size == -1 || docCounter < size)) {
- val newDocs = reader.readConllDocs(files(fileIdx).getAbsolutePath);
- docs ++= newDocs;
- docCounter += newDocs.size
+      val pp = files(fileIdx).getAbsolutePath
+      try {
+        Logger.logss("Loading doc: " + pp)
+        val newDocs = reader.readConllDocs(pp);
+        docs ++= newDocs;
+        docCounter += newDocs.size
+      } catch {
+        // log and skip unreadable documents rather than aborting the whole load
+        case e : Exception => {
+          Logger.logss("Failed to load document " + pp)
+          e.printStackTrace(System.err)
+        }
+      }
fileIdx += 1;
}
val numDocs = if (size == -1) docs.size else Math.min(size, files.size);
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
index 395a268..422a694 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
@@ -16,7 +16,7 @@ import edu.berkeley.nlp.entity.wiki.WikiAnnotReaderWriter
object ConllDocWriter {
- def writeDoc(writer: PrintWriter, conllDoc: ConllDoc, clustering: OrderedClusteringBound) {
+ def writeDoc(writer: PrintWriter, conllDoc: Document, clustering: OrderedClusteringBound) {
writeIncompleteConllDoc(writer, conllDoc.docID, conllDoc.docPartNo, conllDoc.words, conllDoc.pos, conllDoc.trees.map(_.constTree), conllDoc.speakers, conllDoc.nerChunks, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size));
// val corefBits = getCorefBits(conllDoc.words.map(_.size), convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size));
// val numZeroesToAddToPartNo = 3 - conllDoc.docPartNo.toString.size;
@@ -35,7 +35,7 @@ object ConllDocWriter {
}
def writeDocWithPredAnnotations(writer: PrintWriter,
- conllDoc: ConllDoc,
+ conllDoc: Document,
nerChunks: Seq[Seq[Chunk[String]]],
corefClustering: OrderedClusteringBound,
wikiChunks: Option[Seq[Seq[Chunk[String]]]] = None) {
@@ -45,7 +45,7 @@ object ConllDocWriter {
def writeDocWithPredAnnotationsWikiStandoff(writer: PrintWriter,
standoffWriter: PrintWriter,
- conllDoc: ConllDoc,
+ conllDoc: Document,
nerChunks: Seq[Seq[Chunk[String]]],
corefClustering: OrderedClusteringBound,
wikiChunks: Seq[Seq[Chunk[String]]]) {
@@ -54,7 +54,7 @@ object ConllDocWriter {
}
def writeIncompleteConllDoc(writer: PrintWriter,
- doc: ConllDoc) {
+ doc: Document) {
writeIncompleteConllDocNestedNER(writer, doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees.map(_.constTree), doc.speakers, doc.nerChunks, doc.corefChunks);
}
@@ -210,7 +210,7 @@ object ConllDocWriter {
}
}
- def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: ConllDoc) {
+ def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: Document) {
writer.println("O\t0\t0\tO\t-X-\t-DOCSTART-\tx\tx\t0");
// B-LOC 0 0 I-NP NNP Portugal x x 0
diff --git a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
index 641cd4c..31a0d06 100644
--- a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
@@ -16,10 +16,11 @@ import java.util.Collections
import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder
import edu.berkeley.nlp.futile.ling.CollinsHeadFinder
+@SerialVersionUID(1L)
class DepConstTree(val constTree: Tree[String],
val pos: Seq[String],
val words: Seq[String],
- val childParentDepMap: HashMap[Int,Int]) {
+ val childParentDepMap: HashMap[Int,Int]) extends Serializable {
require(childParentDepMap.keys.toSeq.sorted.sameElements((0 until words.size)), PennTreeRenderer.render(constTree));
def size = words.size;
diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala
new file mode 100644
index 0000000..cf95766
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala
@@ -0,0 +1,39 @@
+package edu.berkeley.nlp.entity
+
+/**
+ * Minimal read-only view of an annotated document; ConllDoc and WikiDoc
+ * are the two implementations.
+ * Created by matthew on 2/18/15.
+ */
+trait Document {
+ def docID : String
+ def docPartNo : Int
+  // words of each sentence, including punctuation
+  def words : Seq[Seq[String]]
+  // part-of-speech tags, parallel to words
+  def pos : Seq[Seq[String]]
+  // parse tree (with dependencies) of each sentence
+  def trees : Seq[DepConstTree]
+  // named-entity chunks labeled with their types, e.g. ORG-NAM
+  def nerChunks : Seq[Seq[Chunk[String]]]
+  // [start, end) spans labeled with the ID of the entity
+  // they refer to
+  def corefChunks : Seq[Seq[Chunk[Int]]]
+  // one speaker entry per token; "-" when the speaker is unknown
+  def speakers : Seq[Seq[String]]
+
+  def numSents : Int = words.size
+
+ def uid : (String, Int) = docID -> docPartNo
+
+ def fileName : String
+
+ def printableDocName : String
+
+ def isConversation : Boolean = false
+
+ def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]]
+
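+  // mutable cache slot for a per-document vector; null until client code fills it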
+ var documentVectorCache: Array[Int] = null
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
index 1fad8ce..2bf9fa3 100644
--- a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
@@ -153,7 +153,7 @@ object EntitySystem {
ConllDocReader.loadRawConllDocsWithSuffix(goldPath, size, goldSuffix));
} else {
(ConllDocReader.loadRawConllDocsWithSuffix(path, size, suffix),
- new ArrayBuffer[ConllDoc]());
+ new ArrayBuffer[Document]());
}
val goldWikification = new HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]];
val assembler = CorefDocAssembler(Driver.lang, Driver.useGoldMentions);
diff --git a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
index 803cd6d..8031560 100644
--- a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
@@ -406,7 +406,7 @@ object GUtil {
def argMaxIdxFloat(values: Seq[Float]) = {
var currIdx = 0;
var maxIdx = 0;
- var maxVal = Double.NegativeInfinity;
+ var maxVal = Float.NegativeInfinity;
while (currIdx < values.size) {
if (values(currIdx) > maxVal) {
maxIdx = currIdx;
diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala
new file mode 100644
index 0000000..fc1ab62
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala
@@ -0,0 +1,37 @@
+package edu.berkeley.nlp.entity
+
+/**
+ * Created by matthew on 2/18/15.
+ */
+@SerialVersionUID(1L)
+case class WikiDoc (docID : String,
+ docPartNo : Int,
+ words : Seq[Seq[String]],
+ pos : Seq[Seq[String]],
+ trees: Seq[DepConstTree],
+ nerChunks : Seq[Seq[Chunk[String]]],
+ corefChunks : Seq[Seq[Chunk[Int]]],
+ speakers : Seq[Seq[String]],
+ wikiRefChunks : Seq[Seq[Chunk[String]]] ) extends Document {
+
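+  // wikiRefChunks holds, per sentence, the spans labeled with the Wikipedia titles they refer to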
+ override val numSents = words.size;
+
+ override def uid = docID -> docPartNo;
+
+ override def fileName = {
+ if (docID.contains("/")) {
+ docID.substring(docID.lastIndexOf("/") + 1);
+ } else {
+ docID;
+ }
+ }
+
+ override def printableDocName = docID + " (part " + docPartNo + ")";
+
+ override def isConversation = docID.startsWith("bc") || docID.startsWith("wb")
+
+  // wiki docs carry no gold NER layer, so there is never a corresponding chunk
+  override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None;
+
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala
new file mode 100644
index 0000000..2c2f6d8
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala
@@ -0,0 +1,18 @@
+package edu.berkeley.nlp.entity
+
+import edu.berkeley.nlp.entity.lang.Language
+
+/**
+ * Created by matthew on 2/18/15.
+ */
+class WikiDocReader (lang : Language, better : String) {} // TODO: remove
+
+object WikiDocReader {
+  def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = {
+    // docs come from one serialized List[WikiDoc] rather than a directory of files;
+    // the extra parameters are unused but kept for parity with ConllDocReader
+    val docs = GUtil.load(path).asInstanceOf[List[WikiDoc]]
+    // Seq is covariant, so List[WikiDoc] upcasts to Seq[Document] with no per-element casts
+    if (size == -1) docs else docs.take(size)
+  }
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
index bfd8b14..ee9b457 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
@@ -9,22 +9,22 @@ import scala.sys.process.stringSeqToProcess
import scala.sys.process.Process
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.Driver
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ConllDocWriter
class CorefConllScorer(val conllEvalScriptPath: String) {
- def renderFinalScore(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = {
+ def renderFinalScore(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = {
val summary = score(conllDocs, rawPredClusterings, goldClusterings, true);
CorefConllScorer.processConllString(summary, false);
}
- def renderSuffStats(conllDoc: ConllDoc, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = {
+ def renderSuffStats(conllDoc: Document, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = {
val summary = score(Seq(conllDoc), Seq(rawPredClustering), Seq(goldClustering), false);
CorefConllScorer.processConllString(summary, true);
}
- def score(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = {
+ def score(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = {
val predClusterings = if (Driver.doConllPostprocessing) rawPredClusterings.map(_.postprocessForConll()) else rawPredClusterings;
// var predFile = File.createTempFile("temp", ".conll");
val (predFile, goldFile) = if (Driver.conllOutputDir != "" && saveTempFiles) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
index f7cc4b6..f5634fb 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
@@ -10,9 +10,9 @@ import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.GUtil
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
-case class CorefDoc(val rawDoc: ConllDoc,
+case class CorefDoc(val rawDoc: Document,
val goldMentions: Seq[Mention],
val goldClustering: OrderedClustering,
val predMentions: Seq[Mention]) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
index 9c369e3..413e1cd 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
@@ -12,17 +12,17 @@ import edu.berkeley.nlp.entity.lang.ChineseCorefLanguagePack
import edu.berkeley.nlp.entity.lang.ArabicCorefLanguagePack
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
case class ProtoMention(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIdx: Int);
case class ProtoMentionFancy(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIndices: Seq[Int]);
-case class ProtoCorefDoc(val doc: ConllDoc, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]);
+case class ProtoCorefDoc(val doc: Document, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]);
class CorefDocAssembler(val langPack: CorefLanguagePack,
val useGoldMentions: Boolean) {
- def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -31,7 +31,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def createCorefDocFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = {
+ def createCorefDocFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -41,11 +41,11 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack);
}
- def extractPredMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
+ def extractPredMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSorted(rawDoc, gms);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -54,7 +54,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- def extractPredMentionsFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = {
+ def extractPredMentionsFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSortedFancy(rawDoc, gms, possibleChunks);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -63,7 +63,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- private def getProtoMentionsSorted(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = {
+ private def getProtoMentionsSorted(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK
@@ -131,7 +131,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
// }
}
- private def getProtoMentionsSortedFancy(rawDoc: ConllDoc, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = {
+ private def getProtoMentionsSortedFancy(rawDoc: Document, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NPs and PRPs *except* for those contained in NE chunks (the NE tagger seems more reliable than the parser)
@@ -154,7 +154,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
}
}
- private def filterNonMaximalNPs(rawDoc: ConllDoc, mentionExtents: Seq[HashSet[ProtoMention]]) = {
+ private def filterNonMaximalNPs(rawDoc: Document, mentionExtents: Seq[HashSet[ProtoMention]]) = {
val filteredProtoMentionsSorted = (0 until rawDoc.numSents).map(i => new ArrayBuffer[ProtoMention]);
for (sentIdx <- 0 until mentionExtents.size) {
val protoMentionsByHead = mentionExtents(sentIdx).groupBy(_.headIdx);
@@ -211,7 +211,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
//////////////////
- def createCorefDocWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDocWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentionsWithCoordination(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -220,7 +220,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def extractGoldMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
val goldProtoMentionsSorted = getGoldProtoMentionsSortedWithCoordination(rawDoc);
val finalMentions = new ArrayBuffer[Mention]();
val goldClusterLabels = new ArrayBuffer[Int]();
@@ -238,7 +238,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
(finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels));
}
- def extractPredMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
+ def extractPredMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSortedWithCoordination(rawDoc, gms);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -247,7 +247,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- private def getGoldProtoMentionsSortedWithCoordination(rawDoc: ConllDoc): Seq[Seq[ProtoMentionFancy]] = {
+ private def getGoldProtoMentionsSortedWithCoordination(rawDoc: Document): Seq[Seq[ProtoMentionFancy]] = {
val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield {
for (chunk <- rawDoc.corefChunks(sentIdx)) yield {
val headIndices = rawDoc.trees(sentIdx).getSpanHeadOrNPCoordinatedHeads(chunk.start, chunk.end);
@@ -257,7 +257,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
goldProtoMentions.map(_.sortBy(ment => (ment.sentIdx, ment.headIndices.head, ment.endIdx, ment.startIdx)));
}
- private def getProtoMentionsSortedWithCoordination(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = {
+ private def getProtoMentionsSortedWithCoordination(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMentionFancy]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK
@@ -442,7 +442,7 @@ object CorefDocAssembler {
new CorefDocAssembler(langPack, useGoldMentions);
}
- def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = {
val goldProtoMentionsSorted = getGoldProtoMentionsSorted(rawDoc);
val finalMentions = new ArrayBuffer[Mention]();
val goldClusterLabels = new ArrayBuffer[Int]();
@@ -460,7 +460,7 @@ object CorefDocAssembler {
(finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels));
}
- def getGoldProtoMentionsSorted(rawDoc: ConllDoc): Seq[Seq[ProtoMention]] = {
+ def getGoldProtoMentionsSorted(rawDoc: Document): Seq[Seq[ProtoMention]] = {
val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield {
for (chunk <- rawDoc.corefChunks(sentIdx)) yield {
val headIdx = rawDoc.trees(sentIdx).getSpanHead(chunk.start, chunk.end);
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
index cacd259..41a80e3 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
@@ -5,13 +5,13 @@ import edu.berkeley.nlp.futile.util.Logger
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.wiki.ACEMunger
import java.io.File
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
class CorefDocAssemblerACE(dirPath: String) {
val langPack = new EnglishCorefLanguagePack()
- def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
index 208c342..85adc64 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
@@ -39,7 +39,7 @@ import edu.berkeley.nlp.entity.xdistrib.DocumentGraphComponents
import edu.berkeley.nlp.futile.fig.exec.Execution
import edu.berkeley.nlp.entity.Driver
import edu.berkeley.nlp.entity.GUtil
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.WordNetInterfacer
import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.entity.ConllDocReader
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
index 58b9cd4..c31144a 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
@@ -8,14 +8,15 @@ import edu.berkeley.nlp.entity.sem.SemClasser
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.Driver;
import edu.berkeley.nlp.entity.WordNetInterfacer
// TODO: Extract an interface for ConllDoc so I don't have to keep the whole
// document around...but while I'm feature engineering it's useful to be able
// to put my hands on anything I want
-class Mention(val rawDoc: ConllDoc,
+// (this change does exactly that: rawDoc is now typed against the Document trait)
+class Mention(val rawDoc: Document,
val mentIdx: Int,
val sentIdx: Int,
val startIdx: Int,
@@ -39,6 +40,18 @@
var cachedNerPossibilities: Option[Chunk[Counter[String]]] = None;
var cachedNerGold: Option[Chunk[String]] = None;
+  // renders the mention with one word of context on each side; endIdx is
+  // exclusive, following the [start, end) convention of Chunk spans
+  override def toString = {
+    var ret = "{"
+    if (startIdx > 0)
+      ret += rawDoc.words(sentIdx)(startIdx - 1) + " "
+    ret += "[" + spanToString + "]"
+    if (endIdx < rawDoc.words(sentIdx).size)
+      ret += " " + rawDoc.words(sentIdx)(endIdx)
+    ret + "}"
+  }
+
def speaker = rawDoc.speakers(sentIdx)(headIdx);
def headString = rawDoc.words(sentIdx)(headIdx);
@@ -247,7 +258,7 @@ object Mention {
val StartPosPlaceholder = "";
val EndPosPlaceholder = "";
- def createMentionComputeProperties(rawDoc: ConllDoc,
+ def createMentionComputeProperties(rawDoc: Document,
mentIdx: Int,
sentIdx: Int,
startIdx: Int,
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
index 31c32f6..21b1ac7 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
@@ -21,6 +21,8 @@ import edu.berkeley.nlp.entity.WordNetInterfacer
* DO NOT try to add WordNetInterfacer here! It is not serializable and so
* everything will explode when we try to serialize the model. So we choose
* to cache it on the documents even though this is pretty hacky.
+ *
+ * TODO: marking such fields @transient might be a cleaner fix than caching them on documents.
*/
@SerialVersionUID(1L)
class PairwiseIndexingFeaturizerJoint(val featureIndexer: Indexer[String],
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
index 512cc27..a78e96f 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
@@ -15,10 +15,10 @@ import edu.berkeley.nlp.entity.Driver
import edu.berkeley.nlp.entity.ner.NerFeaturizer
import scala.collection.mutable.HashSet
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ner.NerPruner
-class JointDoc(val rawDoc: ConllDoc,
+class JointDoc(val rawDoc: Document,
val docGraph: DocumentGraph,
val goldNERChunks: Seq[Seq[Chunk[String]]],
val goldWikiChunks: Seq[Seq[Chunk[String]]]) {
@@ -71,7 +71,7 @@ class JointDoc(val rawDoc: ConllDoc,
object JointDoc {
- def apply(rawDoc: ConllDoc,
+ def apply(rawDoc: Document,
docGraph: DocumentGraph,
maybeGoldNERChunks: Option[Seq[Seq[Chunk[String]]]],
maybeGoldWikiChunks: Option[Seq[Seq[Chunk[String]]]]) = {
@@ -89,7 +89,7 @@ object JointDoc {
}
def assembleJointDocs(docGraphs: Seq[DocumentGraph],
- goldConllDocsForNER: Seq[ConllDoc],
+ goldConllDocsForNER: Seq[Document],
goldWikification: HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]) = {
docGraphs.map(docGraph => {
val rawDoc = docGraph.corefDoc.rawDoc;
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
index fc78b5e..85c9683 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
@@ -5,13 +5,13 @@ import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.DocumentGraph
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.entity.wiki._
import edu.berkeley.nlp.futile.util.Logger
-class JointDocACE(val rawDoc: ConllDoc,
+class JointDocACE(val rawDoc: Document,
val docGraph: DocumentGraph,
val goldWikiChunks: Seq[Seq[Chunk[Seq[String]]]]) {
@@ -36,7 +36,7 @@ class JointDocACE(val rawDoc: ConllDoc,
object JointDocACE {
- def apply(rawDoc: ConllDoc,
+ def apply(rawDoc: Document,
docGraph: DocumentGraph,
maybeGoldWikiChunks: Option[Seq[Seq[Chunk[Seq[String]]]]]) = {
val goldWikiChunks = if (maybeGoldWikiChunks.isDefined) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
index 667672b..afeb3f7 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
@@ -3,7 +3,7 @@ package edu.berkeley.nlp.entity.joint
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.entity.GUtil
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
index 71e9274..cf93562 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
@@ -12,7 +12,7 @@ import edu.berkeley.nlp.entity.coref.CorefDocAssembler
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.DocumentGraph
import edu.berkeley.nlp.futile.fig.exec.Execution
import edu.berkeley.nlp.entity.coref.CorefEvaluator
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
index a0f4c96..0627b42 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
@@ -1,6 +1,6 @@
package edu.berkeley.nlp.entity.ner
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.coref.CorefSystem
@@ -53,11 +53,11 @@ object NEEvaluator {
}));
}
- def evaluate(goldDocs: Seq[ConllDoc], predDocs: Seq[ConllDoc]) {
+ def evaluate(goldDocs: Seq[Document], predDocs: Seq[Document]) {
evaluateChunks(goldDocs, predDocs.map(_.nerChunks));
}
- def evaluateChunks(goldDocs: Seq[ConllDoc], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) {
+ def evaluateChunks(goldDocs: Seq[Document], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) {
var correct = 0;
val correctByLabel = new Counter[String];
var correctSameHead = 0;
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
index fd9cd40..911ba9c 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
@@ -2,13 +2,13 @@ package edu.berkeley.nlp.entity.ner
import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.fig.basic.IOUtils
object NESentenceMunger {
- def writeSentences(file: String, docs: Seq[ConllDoc]) {
+ def writeSentences(file: String, docs: Seq[Document]) {
val out = IOUtils.openOutHard(file);
for (doc <- docs; words <- doc.words) {
out.println(words.foldLeft("")(_ + " " + _).trim);
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
index 1b7a40f..e73e7c2 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
@@ -2,7 +2,7 @@ package edu.berkeley.nlp.entity.ner
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.coref.UID
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.fig.basic.Indexer
import edu.berkeley.nlp.entity.Driver
@@ -10,14 +10,14 @@ import edu.berkeley.nlp.futile.util.Logger
trait NerPruner {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]];
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]];
}
@SerialVersionUID(1L)
class NerPrunerFromModel(val nerModel: NerSystemLabeled,
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
val sentMarginals = nerModel.computeLogMarginals(doc.words(sentIdx).toArray, doc.pos(sentIdx).toArray);
NerPruner.pruneFromMarginals(sentMarginals, nerModel.labelIndexer, pruningThreshold);
}
@@ -28,7 +28,7 @@ class NerPrunerFromMarginals(val nerMarginals: HashMap[UID,Seq[Array[Array[Float
val neLabelIndexer: Indexer[String],
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
require(nerMarginals.contains(doc.uid), "Doc ID " + doc.uid + " doesn't have precomputed NER marginals" +
" and the NER pruner in this model is configured to rely on these. You need to either change" +
" how you specify the pruner (if training) or use a different model entirely (if testing)");
@@ -42,7 +42,7 @@ class NerPrunerFromMarginalsAndModel(val nerMarginals: HashMap[UID,Seq[Array[Arr
val nerModel: NerSystemLabeled,
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
val sentMarginals = if (nerMarginals.contains(doc.uid)) {
nerMarginals(doc.uid)(sentIdx)
} else {
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
index 2d1bb7a..7cf1b43 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
@@ -2,11 +2,10 @@ package edu.berkeley.nlp.entity.ner
import edu.berkeley.nlp.futile.fig.basic.Indexer
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity._
import edu.berkeley.nlp.futile.classify.GeneralLogisticRegression
import edu.berkeley.nlp.entity.coref.CorefSystem
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.classify.SequenceExample
import edu.berkeley.nlp.futile.fig.basic.IOUtils
import java.io.FileInputStream
@@ -15,12 +14,9 @@ import java.io.File
import java.io.FileOutputStream
import java.io.ObjectOutputStream
import edu.berkeley.nlp.futile.util.Counter
-import edu.berkeley.nlp.entity.Chunk
import scala.collection.mutable.HashMap
-import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.lang.Language
import scala.util.Random
-import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.math.SloppyMath
import edu.berkeley.nlp.entity.wiki.WikipediaInterface
import edu.berkeley.nlp.entity.coref.UID
@@ -194,7 +190,8 @@ object NerSystemLabeled {
// transitionMatrix.map(_.map(arr => if (arr != null) arr.map(featureIndexer.getIndex(_)) else null));
// }
- def replaceNer(doc: ConllDoc, newChunks: Seq[Seq[Chunk[String]]]) = {
+ def replaceNer(doc: Document, newChunks: Seq[Seq[Chunk[String]]]) = {
+ // MFL TODO: this still builds a ConllDoc; does it need to handle other Document types too?
new ConllDoc(doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees, newChunks, doc.corefChunks, doc.speakers);
}
@@ -227,7 +224,7 @@ object NerSystemLabeled {
// TRAINING
- def trainNerSystem(trainDocs: Seq[ConllDoc],
+ def trainNerSystem(trainDocs: Seq[Document],
maybeBrownClusters: Option[Map[String,String]],
nerFeatureSet: Set[String],
reg: Double,
@@ -267,7 +264,7 @@ object NerSystemLabeled {
// EVALUATION
- def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[ConllDoc]) {
+ def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[Document]) {
val labelIndexer = nerSystem.labelIndexer;
Logger.logss("Extracting test examples");
val testExamples = extractNerChunksFromConll(testDocs);
@@ -332,7 +329,7 @@ object NerSystemLabeled {
}
}
- def extractNerChunksFromConll(docs: Seq[ConllDoc]): Seq[NerExample] = {
+ def extractNerChunksFromConll(docs: Seq[Document]): Seq[NerExample] = {
val chunkTypeCounts = new Counter[String];
val examples = docs.flatMap(doc => {
val chunksToUse = doc.nerChunks
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
index 1d3a0d7..78fba09 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
@@ -12,6 +12,7 @@
import edu.berkeley.nlp.PCFGLA.TreeAnnotations;
import edu.berkeley.nlp.entity.ConllDocJustWords;
import edu.berkeley.nlp.entity.ConllDocReader;
+import edu.berkeley.nlp.entity.WikiDocReader;
import edu.berkeley.nlp.entity.lang.Language;
import edu.berkeley.nlp.entity.ner.NerSystemLabeled;
import edu.berkeley.nlp.futile.fig.basic.IOUtils;
@@ -92,7 +93,7 @@ public class PreprocessingDriver implements Runnable {
public static boolean useAlternateTokenizer = false;
public static enum Mode {
- RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL;
+ RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL, WIKILIMITED;
}
public static void main(String[] args) {
@@ -128,6 +129,9 @@ public void run() {
Logger.logss("Processed document " + docName + " and wrote result to " + outputDir);
}
writer.close();
+ } else if (mode == Mode.WIKILIMITED) {
+ WikiDocReader docReader = new WikiDocReader(Language.ENGLISH, "");
+ WikiPreprocessor.processesDocs(inputDir + "/", outputDir + "/", docReader, splitter, parser, backoffParser, nerSystem);
} else {
ConllDocReader docReader = new ConllDocReader(Language.ENGLISH, "");
for (File inputFile : new File(inputDir).listFiles()) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
index 19ac409..9e8ee9e 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
@@ -1,7 +1,7 @@
package edu.berkeley.nlp.entity.preprocess
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
index 8ac70d1..85c7a97 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
@@ -1,5 +1,5 @@
package edu.berkeley.nlp.entity.preprocess
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.CorefSystem
import scala.io.Source
import scala.collection.mutable.ArrayBuffer
@@ -99,8 +99,8 @@ object SentenceSplitter {
def featurize(featureIndexer: Indexer[String], addToIndexer: Boolean): Array[Int] = {
val featStrs = new ArrayBuffer[String];
- val pw = prevWord;
- val fw = followingWord;
+ val pw = if(prevWord.isEmpty) " " else prevWord
+ val fw = if (followingWord.isEmpty) " " else followingWord
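+ // N.B. pad empty tokens with a single space so pw.last and fw.charAt(0)
+ // below cannot throw on empty strings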
val fwcls = (if (Character.isUpperCase(fw.charAt(0))) "UC" else if (Character.isLowerCase(fw.charAt(0))) "LC" else if (!Character.isLetterOrDigit(fw.charAt(0))) "PU" else "OTHER");
featStrs += ("Bias=1");
featStrs += ("LastChar=" + pw.last);
@@ -242,7 +242,7 @@ object SentenceSplitter {
}
- private def readExamplesFromConll(docs: Seq[ConllDoc]): Seq[SplitExample] = {
+ private def readExamplesFromConll(docs: Seq[Document]): Seq[SplitExample] = {
// N.B. we only loop up until size - 1 since the end of the last sentence
// has no following context and isn't a good training example.
// We extract pretty much all positives except for really weird stuff.
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala
new file mode 100644
index 0000000..21af271
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala
@@ -0,0 +1,282 @@
+package edu.berkeley.nlp.entity.preprocess
+
+import java.io.File
+
+import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
+import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder
+import edu.berkeley.nlp.entity._
+import edu.berkeley.nlp.entity.ner.NerSystemLabeled
+import edu.berkeley.nlp.futile.util.Logger
+import edu.berkeley.nlp.futile.syntax.Tree
+import edu.berkeley.nlp.futile.fig.basic.Indexer
+import edu.berkeley.nlp.futile.fig.basic.IOUtils
+
+import scala.collection.mutable.{ArrayBuffer, ListBuffer}
+import scala.xml._
+import scala.concurrent._
+import scala.collection.JavaConverters._
+
+import ExecutionContext.Implicits.global
+
+/**
+ * Created by matthew on 2/21/15.
+ */
+object WikiPreprocessor {
+
+ val headFinder = new ModCollinsHeadFinder()
+
+ def processesDocs (inputDir : String, outputDir : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) = {
+ val wikiDocs = new File(inputDir).listFiles.par.map(file => {
+ val input_file = file.getAbsolutePath
+ val output_file = outputDir + file.getName
+ try {
+ process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem)
+ } catch {
+ case e : Exception => {
+ Logger.logss("failed file: "+input_file)
+ System.err.print(e.toString)
+ e.printStackTrace(System.err)
+ null
+ }
+ }
+ }).filter(_ != null).toList
+ GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz")
+ }
+
+ def process(inputFile : String, outputFile : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) : WikiDoc = {
+ val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem)
+ val lines = wikiToConllLines(wdoc)
+ //val wlines = wiki.WikiAnnotReaderWriter.getWikiBits(wdoc.words.map(_.size), wdoc.wikiRefChunks)
+ val wlines = wikiToWikiLines(wdoc)
+ //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile)
+ writeWikiLines(wdoc.docID, lines, outputFile)
+ writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki"))
+ wdoc
+ }
+
+ def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = {
+ var writer = IOUtils.openOutHard(outputFile)
+ writer.println("#begin document (" + docID + "); part 000")
+ lines.foreach(l => {
+ l.foreach(writer.println(_))
+ writer.println
+ })
+ writer.close()
+ }
+
+ def wikiToConllLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ val ret = ListBuffer[Seq[String]]()
+ //ret.append("#begin document (" + wdoc.docID + "); part " + wdoc.docPartNo)
+ for(i <- 0 until wdoc.numSents) {
+ val parseBits = PreprocessingDriver.computeParseBits(Reprocessor.convertFromFutileTree(wdoc.trees(i).constTree))
+ //val nerBits = PreprocessingDriver.computeNerBits(wdoc.nerChunks(i).toArray)
+ val corefBits = computeBits(wdoc.corefChunks(i), wdoc.words(i).size)
+ var lines = new ListBuffer[String]()
+ // conll: [doc name] [part num] [word num] [word] [pos] [parsebit] [6] [7] [8] [speakers] [nerbit] [corefbit]
+ for(j <- 0 until wdoc.words(i).size) {
+ lines.append(wdoc.docID + "\t" +
+ wdoc.docPartNo + "\t" +
+ j + "\t" +
+ wdoc.words(i)(j) + "\t" +
+ wdoc.pos(i)(j) + "\t" +
+ parseBits(j) + "\t" +
+ "\t-\t-\t-\t" +
+ "-\t" + // speakers
+ "*\t" + // nerbit
+ corefBits(j) + "\t" // coref bits
+ )
+ }
+ ret.append(lines.toSeq)
+ }
+ ret.toSeq
+ }
+
+ def computeBits[T](items : Seq[Chunk[T]], len : Int) : Array[String] = {
+ var ret = Array.fill(len)(List[String]())
+ items.foreach(c => {
+ if(c.start == c.end -1) {
+ ret(c.start) = ret(c.start) :+ ("(" + c.label + ")")
+ } else {
+ ret(c.start) = ret(c.start) :+ ("(" + c.label)
+ ret(c.end - 1) = ret(c.end - 1) :+ (c.label + ")")
+ }
+ })
+ ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)})
+ }
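+ // e.g. (hypothetical chunks): computeBits(Seq(new Chunk(0, 1, 3), new Chunk(1, 4, 7)), 4)
+ // yields Array("(3)", "(7", "-", "7)"), the usual CoNLL bracket encoding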
+
+ /*def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ // this does not handle multiple chunks on the same span well, but that shouldn't be an issue, since wiki docs shouldn't have that
+ val ret = ListBuffer[Seq[String]]()
+ for(i <- 0 until wdoc.numSents) {
+ val lines = new ListBuffer[String]()
+ for(j <- 0 until wdoc.words(i).size) {
+ var s = ""
+ wdoc.wikiRefChunks(i).foreach(c => {
+ if(c.start == j)
+ s = "(" + c.label
+ })
+ s += "*"
+ wdoc.wikiRefChunks(i).foreach(c => {
+ if(c.end == j + 1)
+ s += ")"
+ })
+ lines.append(s)
+ }
+ ret.append(lines.toSeq)
+ }
+ ret.toSeq
+ }*/
+
+ def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ for (sentIdx <- 0 until wdoc.words.size) yield {
+ for (tokenIdx <- 0 until wdoc.words(sentIdx).size) yield {
+ val chunksStartingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.start == tokenIdx).sortBy(- _.end);
+ val numChunksEndingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.end - 1 == tokenIdx).size;
+ var str = if(chunksStartingHere.isEmpty) "" else {
+ chunksStartingHere.map("("+_.label.replace("(", "-LRB-").replace(")", "-RRB-").replace("*", "-STAR-")).reduce(_+"|"+_)
+ }
+ str += "*";
+ str += ")" * numChunksEndingHere
+ str;
+ }
+ }
+ }
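+ // e.g. a single chunk over tokens 0-1 labeled "Foo_(bar)" renders as the
+ // two tokens "(Foo_-LRB-bar-RRB-*" and "*)"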
+
+
+ def mkWikiDoc(inputFile : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) : WikiDoc = {
+
+ Logger.logss("starting processing of " + inputFile)
+ val referencesFile = inputFile.replace("RawTexts", "Problems")
+ val refxml = XML.loadFile(referencesFile)
+ val document = scala.io.Source.fromFile(inputFile).mkString.split("\n")
+ val refname = (refxml \ "ReferenceFileName")(0).text.trim
+
+
+ val references = (refxml \ "ReferenceInstance").map(r => (
+ (r \ "SurfaceForm")(0).text.trim,
+ (r \ "Offset")(0).text.trim.toInt,
+ (r \ "Length")(0).text.trim.toInt,
+ (r \ "ChosenAnnotation")(0).text.trim,
+ (r \ "AnnotatorId")(0).text.trim,
+ (r \ "Annotation")(0).text.trim
+ ))
+
+ val canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(document, false, false)
+ val sentences = splitter.splitSentences(canonicalizedParagraphs)
+ val tokens = SentenceSplitter.tokenize(sentences)
+
+
+ val doclenratio = sentences.map(_.size).sum.toFloat / document.map(_.size + 1).sum
+ def refFinder (ref : (String, Int, Int, String, String, String)) : (Int, Chunk[String]) = {
+ val d = doclenratio * (ref._2 + ref._3 / 2.0)
+ var cnt = 0
+ val wrds = ref._1.replace(" ", "")
+
+ if(wrds.isEmpty) // guard against references with an empty surface form; skip them
+ return (-1, null)
+
+ def rank_match(i : Int, j : Int) : Double = {
+ val res = tokens(i).drop(j).reduce(_+_)
+ for(q <- 0 until Math.min(wrds.size, res.size)) {
+ if (res(q) != wrds(q))
+ return q.toDouble / wrds.size
+ }
+ 1.0
+ }
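+ // e.g. if wrds == "BarackObama" and the tokens from j on are
+ // ["Barack", "Obama", ","], res starts with all 11 characters of wrds, so
+ // rank_match returns 1.0; a mismatch at position q returns q / wrds.size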
+ var best_start = 0
+ var best_rank = Double.NegativeInfinity
+ var best_sentence = 0
+ for(i <- 0 until sentences.size) {
+ var tcnt = 0
+ for(j <- 0 until tokens(i).size) {
+ val r = rank_match(i, j) / Math.log(Math.abs(d - cnt - tcnt) + 2) // a little too simple, but works in most cases
+ if(r > best_rank) {
+ best_rank = r
+ best_start = j
+ best_sentence = i
+ }
+ tcnt += tokens(i)(j).size + 1 // +1 to match the space
+ }
+ cnt += sentences(i).size
+ }
+ var len = 0
+ var len_cnt = 0
+ for(j <- best_start until tokens(best_sentence).size; if len_cnt < wrds.size) {
+ len_cnt += tokens(best_sentence)(j).size
+ len += 1
+ }
+ if(len == 0)
+ return (-1, null)
+ (best_sentence, new Chunk(best_start, best_start + len, ref._4))
+ }
+
+ val refplaces = references.map(refFinder)
+
+ val refsorted = refplaces.foldLeft(Map[Int, List[Chunk[String]]]().withDefaultValue(List()))((m, itm) => {
+ if(itm._1 != -1) {
+ m.updated(itm._1, m(itm._1) :+ itm._2)
+ } else
+ m
+ })
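+ // e.g. refplaces = Seq((0, cA), (-1, null), (2, cB)) folds to
+ // Map(0 -> List(cA), 2 -> List(cB)); other sentence indices default to Nil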
+
+ val parses: Array[Tree[String]] = tokens.map(t => {
+ //try {
+ Reprocessor.convertToFutileTree(
+ PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))
+ /*} catch {
+ case e : java.lang.NullPointerException => {
+ null;
+ }
+ }*/
+ })
+
+ // filter out sentences whose parse yield doesn't match their tokens; unclear how this affects downstream results
+ val tps = (tokens, parses, 0 until tokens.size).zipped
+ .filter((a,b,c) => a.length == b.getYield.size)
+
+ val indexer = new Indexer[String]()
+
+ val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala })
+
+ val trees = for(i <- 0 until tps._1.size) yield {
+ val childParentMap = DepConstTree.extractDependencyStructure(tps._2(i), headFinder)
+ new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap)
+ }
+
+ val empty = tps._1.map(l => (0 until l.length).map(a=>"-")).toSeq
+
+ val wikiDoc = new WikiDoc(
+ docID=inputFile,
+ docPartNo=refname.toInt,
+ words=tps._1.toSeq.map(_.toSeq),
+ pos=pos,
+ trees=trees,
+ nerChunks=tps._1.map(a=>Seq()), // todo
+ corefChunks=tps._3.map(i => {
+ refsorted(i).map(c => new Chunk(c.start, c.end, indexer.getIndex(c.label)))
+ }),
+ speakers=empty, // todo?
+ wikiRefChunks=tps._3.map(refsorted(_))
+ )
+
+ Logger.logss("done with "+inputFile)
+
+ wikiDoc
+ }
+
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala
new file mode 100644
index 0000000..acd575e
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala
@@ -0,0 +1,42 @@
+package edu.berkeley.nlp.entity.wiki
+
+import edu.berkeley.nlp.futile.LightRunner
+
+/**
+ * Created by matthewfl
+ *
+ * We want to work with the who document at a time rather then just a single link
+ * this will allow us to
+ */
+class DocumentedSetChooser {
+
+}
+
+
+object DocumentedSetChooser {
+
+ val trainDataPath = "data/ace05/train";
+ val testDataPath = "data/ace05/dev";
+ val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both the train and dev sets
+ val wikiDBPath = "models/wiki-db-ace.ser.gz"
+
+ val lambda = 1e-8F
+ val batchSize = 1
+ val numItrs = 20
+
+ val maxNumWikificationOptions = 20 // previously 7
+
+ val numLoadedSamples = -1 // for debugging: load fewer documents (-1 loads all)
+
+
+ def main(args: Array[String]) = {
+ LightRunner.initializeOutput(DocumentedSetChooser.getClass)
+ LightRunner.populateScala(DocumentedSetChooser.getClass, args)
+
+ // load the documents
+
+
+
+ LightRunner.finalizeOutput()
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
index 4d03771..b145732 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
@@ -3,16 +3,14 @@ package edu.berkeley.nlp.entity.wiki
import edu.berkeley.nlp.entity.lang.Language
import edu.berkeley.nlp.futile.LightRunner
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
-import edu.berkeley.nlp.entity.ConllDocReader
+import edu.berkeley.nlp.entity._
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.fig.basic.Indexer
import edu.berkeley.nlp.entity.joint.LikelihoodAndGradientComputer
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.coref.CorefDoc
import edu.berkeley.nlp.futile.math.SloppyMath
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.Chunk
import edu.berkeley.nlp.entity.joint.GeneralTrainer
/**
@@ -31,6 +29,8 @@ case class JointQueryDenotationExample(val queries: Seq[Query],
// Feature caches since feature computation is expensive if redone every time
var cachedFeatsEachQuery: Array[Array[Int]] = null;
var cachedFeatsEachQueryDenotation: Array[Array[Array[Int]]] = null;
+
+ def document = queries.head.originalMent.rawDoc
}
/**
@@ -42,11 +42,13 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
val featureIndexer: Indexer[String]) extends LikelihoodAndGradientComputer[JointQueryDenotationExample] {
// Used for feature computation
val queryChooser = new QueryChoiceComputer(wikiDB, featureIndexer)
-
+
def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) {
if (ex.cachedFeatsEachQuery == null) {
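+ // lazily build and cache a bag-of-words vector for the mention's document
+ // so repeated featurization does not recompute it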
+ if(ex.document.documentVectorCache == null)
+ ex.document.documentVectorCache = wikiDB.textDB.makeVector(ex.document.words)
ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer)
- ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations(ex.queries, ex.allDenotations, addToIndexer)
+ ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB)
}
}
@@ -55,13 +57,17 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
*/
def getUnnormalizedJointScores(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Array[Float]] = {
featurizeUseCache(ex, false)
+ // each example caches one feature vector per query; features are indicators,
+ // so a query's raw score is the sum of the weights at its cached feature
+ // indices
val rawQueryScores = ex.cachedFeatsEachQuery.map(feats => GUtil.scoreIndexedFeats(feats, weights));
+ // scores for each (query, denotation) pair, computed the same way
val queryDenotationMatrix = ex.cachedFeatsEachQueryDenotation.map(_.map(feats => GUtil.scoreIndexedFeats(feats, weights)));
val scores = Array.tabulate(ex.queries.size, ex.allDenotations.size)((i, j) => Float.NegativeInfinity)
- for (queryIdx <- 0 until ex.queries.size) {
- for (denotationIdx <- 0 until ex.allDenotations.size) {
- scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx)
- }
+ for (queryIdx <- 0 until ex.queries.size; denotationIdx <- 0 until ex.allDenotations.size) {
+ // all features are indicators, so the joint score of a (query, denotation)
+ // pair is the sum of the query score and the query-denotation score
+ scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx)
}
scores
}
@@ -72,7 +78,9 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
*/
def getDenotationLogMarginals(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Float] = {
val scores = getUnnormalizedJointScores(ex, weights)
- // Sum up each column
+ // the scores matrix holds unnormalized log-probabilities,
+ // p(q,d) \propto exp(w^T f(q,d)), so logAdd marginalizes over queries
+ // in log space
val rawDenotationMarginals = Array.tabulate(ex.allDenotations.size)(i => SloppyMath.logAdd(scores.map(_(i))).toFloat)
val normalizer = SloppyMath.logAdd(rawDenotationMarginals).toFloat
(0 until rawDenotationMarginals.size).foreach(i => rawDenotationMarginals(i) -= normalizer)
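+ // e.g. with scores = [[-1.0f], [-0.5f]] (two queries, one denotation), the raw
+ // marginal is logAdd(-1.0, -0.5) ≈ -0.026 before the normalizer is subtracted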
@@ -132,12 +140,47 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
class JointQueryDenotationChooser(val featureIndexer: Indexer[String],
val weights: Array[Float]) extends Serializable {
- def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = {
+ /*def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = {
val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
- val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query));
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]());
computer.computeDenotation(ex, weights)
+ }*/
+
+ def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : (Seq[(String, Int)], Array[Array[Int]]) = {
+ val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions)
+ val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]());
+ val denotationMarginals = computer.getDenotationLogMarginals(ex, weights)
+
+ (ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse,
+ ex.cachedFeatsEachQuery)
+ }
+
+ def printEverything(queries: Seq[Query], wikiDB: WikipediaInterface, correctInd: Int) = {
+ // just redo the same computations as pickDenotations (wasteful but simple)
+ val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions)
+ val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]());
+ val denotationMarginals = computer.getDenotationLogMarginals(ex, weights)
+
+ val sortedItms = ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse
+
+ println(
+ s"""Correct item in $correctInd (${sortedItms(correctInd)._1})
+ |\tGuessed value: ${sortedItms(0)._1}""".stripMargin)
+ for(i <- 0 until queries.length) {
+ println("\t\t"+i+": "+queries(i))
+ println("\t\t"+ex.cachedFeatsEachQuery(i).map(featureIndexer.getObject(_)).mkString(" "))
+ for(j <- 0 until ex.allDenotations.length) {
+ println("\t\t\t"+j+": "+ex.allDenotations(j)+": "+ex.cachedFeatsEachQueryDenotation(i)(j).map(featureIndexer.getObject(_)).mkString(" "))
+ }
+ }
+ println()
}
+
}
object JointQueryDenotationChooser {
@@ -159,11 +202,17 @@ object JointQueryDenotationChooser {
// There are multiple possible gold Wikipedia titles for some mentions. Note that
// NIL (no entry in Wikipedia) is included as an explicit choice, so this includes NILs (as
// it should according to how the task is defined)
- val goldLabel = getGoldWikification(goldWikification(docName), ment)
+ val goldLabelp = getGoldWikification(goldWikification(docName), ment)
+ val goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct
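+ // accept both the annotated titles and their redirect targets as gold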
if (goldLabel.size >= 1) {
+ //val oldqueries = Query.extractQueriesBest_old(ment, true);
val queries = Query.extractQueriesBest(ment, true);
- val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_));
+ /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) {
+ println("failed...")
+ }*/
+ //val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_));
// val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_));
+ val queryDisambigs = queries.map(wikiDB.disambigRes(_))
val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions);
val correctDenotations = denotations.filter(denotation => isCorrect(goldLabel, denotation))
// N.B. The use of "isCorrect" here is needed to canonicalize
@@ -171,6 +220,10 @@ object JointQueryDenotationChooser {
// if (correctIndices.isEmpty &&
if (filterImpossible && correctIndices.isEmpty) {
numImpossible += 1;
+ //println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations)
+ /*if(goldLabel.contains("Lord_Speaker")) {
+ println("wtfwtf")
+ }*/
} else {
exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel)
}
@@ -182,26 +235,52 @@ object JointQueryDenotationChooser {
exs;
}
+
+ def loadDocuments(path : String) = {
+ val limit = numLoadedSamples // -1 loads everything; a small value like 500 helps when debugging
+ if(path.startsWith("wikiser:")) {
+ WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, "", Language.ENGLISH)
+ } else {
+ ConllDocReader.loadRawConllDocsWithSuffix(path, limit, "", Language.ENGLISH)
+ }
+ }
+
val trainDataPath = "data/ace05/train";
val testDataPath = "data/ace05/dev";
- val wikiPath = "data/ace05/ace05-all-conll-wiki"
+ val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both the train and dev sets
val wikiDBPath = "models/wiki-db-ace.ser.gz"
val lambda = 1e-8F
val batchSize = 1
val numItrs = 20
- val maxNumWikificationOptions = 7
+ val maxNumWikificationOptions = 20 // previously 7
+
+ val numLoadedSamples = -1 // for debugging: load fewer documents (-1 loads all)
def main(args: Array[String]) {
LightRunner.initializeOutput(JointQueryDenotationChooser.getClass());
LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args)
// Read in CoNLL documents
val assembler = CorefDocAssembler(Language.ENGLISH, true);
- val trainDocs = ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH);
- val trainCorefDocs = trainDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
-
+ val trainDocs = loadDocuments(trainDataPath);
+ val trainCorefDocs = trainDocs.map(doc => {
+ try {
+ assembler.createCorefDoc(doc, new MentionPropertyComputer(None))
+ } catch {
+ case e : Exception => {
+ // TODO: fix the wikidocument parser
+ println("failed document "+doc.docID)
+ null
+ }
+ }
+ }).filter(_!=null);
+
+ //val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH);
+ val testDocs = loadDocuments(testDataPath)
+ val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
+
// Read in gold Wikification labels
val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath)
// Read in the title given surface database
@@ -209,7 +288,9 @@ object JointQueryDenotationChooser {
// Make training examples, filtering out those with solutions that are unreachable because
// they're not good for training
val trainExs = extractExamples(trainCorefDocs, goldWikification, wikiDB, filterImpossible = true)
-
+
+ // TODO: make this system work over a whole document at a time
+
// Extract features
val featIndexer = new Indexer[String]
val computer = new JointQueryDenotationChoiceComputer(wikiDB, featIndexer);
@@ -225,16 +306,63 @@ object JointQueryDenotationChooser {
// Build the test examples and decode the test set
// No filtering now because we're doing test
- val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH);
- val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
- val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false);
- val goldTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[Seq[String]](i, i+1, testExs(i).rawCorrectDenotations))
- val predTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[String](i, i+1, chooser.pickDenotation(testExs(i).queries, wikiDB)))
+
+ val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = true) // was filterImpossible = false
+
+ var correctItemWasInSet = 0
+
+ val results = testExs.map(t => {
+ // TODO: need more then one perdicted title
+ val (picks, denFeats) = chooser.pickDenotations(t.queries, wikiDB)
+ if(!isCorrect(t.rawCorrectDenotations, picks(0)._1)) {
+ // the pick is not correct; check whether the correct answer appears
+ // anywhere else in the picks list
+ /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) {
+ // the correct pick was the second answer instead of the first one
+ // try and report the differences between the two items
+ println("second pick was correct")
+
+ }*/
+ var qq = -1
+ for((p, i) <- picks.zipWithIndex) {
+ // try: t.correctDenotations here?
+ if(isCorrect(t.correctDenotations, p._1) || isCorrect(t.rawCorrectDenotations, p._1)) {
+ //println("Found correct item with "+i)
+ correctItemWasInSet += 1
+ qq = i
+ //println("found correct item")
+ }
+ }
+ if(qq != -1) {
+ chooser.printEverything(t.queries, wikiDB, qq)
+ /*println(
+ s"""Correct item in place: $qq
+ |\tcorrect value: ${picks(qq)}
+ |\t\t${denFeats(picks(qq)._2).flatMap(featIndexer.getObject(_)).mkString(" ")}
+ |\tchosen value : ${picks(0)}
+ |\t\t${denFeats(picks(0)._2).flatMap(featIndexer.getObject(_)).mkString(" ")}
+ """.stripMargin)
+*/
+ } else {
+ println("THIS QUERY SHOULD HAVE BEEN FILTERED")
+ }
+ }
+ (t.rawCorrectDenotations, picks.map(_._1), t.queries(0).originalMent.rawDoc)
+ })
+
+ val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1))
+ val predTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[String](i, i+1, results(i)._2(0)))
// Hacky but lets us reuse some code that normally evaluates things with variable endpoints
// WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks))
WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set())
-
+
+ val mentionsByDoc = results.groupBy(_._3)
+
+ WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc)
+ println("Number of correct items that were in the set: "+correctItemWasInSet)
+
+
LightRunner.finalizeOutput();
}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
index ce86957..71a1869 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
@@ -16,7 +16,8 @@ case class Query(val words: Seq[String],
val originalMent: Mention,
val finalSpan: (Int, Int),
val queryType: String,
- val removePuncFromQuery: Boolean = true) {
+ val removePuncFromQuery: Boolean = true,
+ val features: List[String] = List[String]()) {
def getFinalQueryStr = {
val wordsNoPunc = if (removePuncFromQuery) {
@@ -40,9 +41,16 @@ object Query {
val PluralQueryExpand = true;
val RemovePuncFromQuery = true;
val UseFirstHead = true;
- val MaxQueryLen = 4;
- val BlackList = Set("the", "a", "my", "your", "his", "her", "our", "their", "its", "this", "that", "these", "those")
- val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' ');
+ val MaxQueryLen = 8;
+ val BlackList = Set(
+ "the", "a", "my", "your", "his", "her", "our",
+ "their", "its", "this", "that", "these", "those",
+ "of"
+ )
+ val PuncList = Set(
+ ',', '.', '!', '?', ':', ';', '\'', '"', '(', ')',
+ '[', ']', '{', '}', ' '
+ )
/**
* Check if a token is "blacklisted", meaning that we shouldn't form a query that starts with
@@ -73,7 +81,7 @@ object Query {
* considering different subsets of the words in the mention and munging capitalization and
* stemming, since lowercasing and dropping a plural-marking "s" are useful for nominals.
*/
- def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
+ def extractQueriesBest_old(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
val queries = new ArrayBuffer[Query];
val mentWords = ment.words;
// Try the whole query, then prefixes ending in the head
@@ -92,6 +100,7 @@ object Query {
if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) {
queriesThisSlice += new Query(Seq(wikiCase(firstWord)) ++ mentWords.slice(indices._1 + 1, indices._2), ment, indices, "WIKICASED", RemovePuncFromQuery);
}
+
// Stemming (but only on head alone)
if (PluralQueryExpand && (indices._2 - indices._1) == 1 && firstWord.last == 's') {
queriesThisSlice ++= queriesThisSlice.map(query => new Query(Seq(removePlural(query.words(0))), ment, indices, query.queryType + "-STEM", RemovePuncFromQuery));
@@ -107,6 +116,52 @@ object Query {
// }
queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]());
}
+
+ def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
+ val queries = new ArrayBuffer[Query]()
+ val mentWords = ment.words
+ val relHeadIdx = ment.contextTree.getSpanHeadACECustom(ment.startIdx, ment.endIdx) - ment.startIdx
+ def addQuery(start: Int, end: Int, featsi:List[String]): Unit = {
+ var feats = featsi // local mutable copy so query-level features can be appended
+ val thisSlice = new ArrayBuffer[Query]()
+ val wrds = mentWords.slice(start, end)
+ thisSlice += new Query(wrds, ment, (start, end), "STD", true, feats ++ List("RemovedPunc"))
+ thisSlice += new Query(wrds, ment, (start, end), "STD", false, feats ++ List("IncludePunc"))
+ val firstWord = wrds(0)
+ val lastWord = wrds(wrds.size - 1)
+ if((end - start)== 1)
+ feats ++= List("SingleItemQuery")
+ if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) {
+ thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", true, feats ++ List("RemovedPunc"));
+ thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", false, feats ++ List("IncludePunc"));
+ }
+ // Stemming (but only on head alone)
+ if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') {
+ thisSlice ++= thisSlice.map(qu =>
+ new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", true, feats ++ List("RemovedPunc")));
+ thisSlice ++= thisSlice.map(qu =>
+ new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", false, feats ++ List("IncludePunc")));
+
+ }
+ queries ++= thisSlice
+ }
+ addQuery(0, ment.endIdx - ment.startIdx, List("SimpleQuery", "FullTextQuery"))
+ // TODO: make this ignore items that simply add a blacklisted word
+ for(i <- 0 to relHeadIdx) {
+ addQuery(i, relHeadIdx + 1, List("SimpleQuery", "PreHeadQuery"))
+ }
+ for(i <- relHeadIdx+1 until mentWords.size) {
+ addQuery(relHeadIdx, i, List("SimpleQuery", "PostHeadQuery"))
+ }
+ // try filtering words
+ val filterWords = mentWords.filter(!isBlacklisted(_, 0))
+ if(filterWords.size != mentWords.size) {
+ // we lost something, make new query
+ queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", true , List("FilteredQuery", "RemovedPunc"))
+ queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", false, List("FilteredQuery", "IncludePunc"))
+ }
+ queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]())
+ }
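+ // e.g. for the mention "the presidents" (head "presidents") this generates,
+ // roughly: "the presidents" and "presidents" in STD/WIKICASED variants (each
+ // with and without punctuation removal), the stemmed head "president", and a
+ // FIT query with the blacklisted "the" dropped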
def extractDenotationSetWithNil(queries: Seq[Query], queryDisambigs: Seq[Counter[String]], maxDenotations: Int): Seq[String] = {
val choicesEachQuery = queryDisambigs.map(_.getSortedKeys().asScala);
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
index e3b4d32..712ad3d 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
@@ -18,6 +18,8 @@ import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
+import scala.collection.mutable
+
case class QueryChoiceExample(val queries: Seq[Query],
val denotations: Seq[String],
val correctQueryIndices: Array[Int]) {
@@ -123,6 +125,176 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface,
val longQuery = tagsWithin.size > 3;
feat("DescriptorQueryTags=" + queryDescriptor + "-" + contextTag + (if (longQuery) "...") + tagsWithin.slice(Math.max(0, tagsWithin.size - 3), tagsWithin.size).toString);
feat("DescriptorHead=" + queryDescriptor + "-" + binSize(querySize) + "-" + ment.headStringLc);
+ for(f <- query.features)
+ feat(f)
+ feats.toArray;
+ });
+ }
+
+ def getDentationLinksSets(denotations: Seq[String], wikiDB: WikipediaInterface) : (Seq[Set[Int]], Seq[Set[Int]]) = {
+ (denotations.map(wikiDB.linksDB.getInLinksSetUseCache(_)), denotations.map(wikiDB.linksDB.getOutLinksSetUseCache(_)))
+ }
+
+ val logsv = (0 until 3000).map(Math.log(_))
+
+ def logs(i: Int) = {
+ if(i < logsv.size)
+ logsv(i)
+ else
+ Math.log(i)
+ }
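+ // memoized natural log: exact table lookup for 0 <= i < 3000 (logsv(0) is
+ // -Infinity, matching Math.log(0)), falling back to Math.log above that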
+
+ def unionSize[T](ss: Set[T]*) = {
+ val ns = new mutable.HashSet[T]()
+ for(s <- ss) {
+ ns ++= s
+ }
+ ns.size
+ }
+
+ def intersectSize[T](a: Set[T], b: Set[T]) = {
+ var smaller: Set[T] = a
+ var larger: Set[T] = b
+ if(a.size > b.size) {
+ larger = a
+ smaller = b
+ }
+ var ret = 0
+ for(i <- smaller) {
+ if(larger.contains(i))
+ ret += 1
+ }
+ ret
+ }
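+ // e.g. intersectSize(Set(1, 2, 3), Set(2, 3, 4)) == 2; iterating the smaller
+ // set keeps the cost at O(min(|a|, |b|)) lookups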
+
+ /*def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = {
+ (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) /
+ (logs(wsize) - logs(math.min(a.size,b.size)))
+ }
+
+ def PMI[T](a: Set[T], b: Set[T], wsize: Int) : Double = {
+ // TODO: the use of wsize here does not make sense;
+ // must be misunderstanding something
+ (intersectSize(a,b) * wsize).asInstanceOf[Float] / (a.size * b.size)
+ }
+
+ def GLOWfeatures[T](fn: (Set[T], Set[T], Int) => Double, refs: Seq[Set[T]], prefix: String): Seq[Array[String]] = {
+ val rsize = refs.size
+ val wsize = unionSize(refs:_*)
+ var max = Double.NegativeInfinity
+ var avg = 0.0
+ // TODO: rank the items in the list
+ //val valList = new mutable.MutableList[Double]()
+ val cache = new mutable.HashMap[Int,Double] {
+ override def initialSize: Int = rsize*rsize
+ }
+ for(a <- 0 until rsize; b <- 0 until rsize) {
+ if(a != b) {
+ val v = fn(refs(a), refs(b), wsize)
+ cache.put(a + b*65536, v)
+ if(v > max)
+ max = v
+ //valList += v
+ avg += v
+ }
+ }
+ avg /= (rsize * (rsize - 1))
+ for(a <- 0 until rsize) yield {
+ var isInMax = false
+ var isAboveAvg = false
+ var isAboveAvg2 = false
+ for(b <- 0 until rsize) {
+ if(a != b) {
+ //val v = fn(refs(a),refs(b),wsize)
+ val v : Double = cache.getOrElse(a + b*65536, 0.0)
+ if(v == max) {
+ isInMax = true
+ }
+ if(v > avg) {
+ isAboveAvg = true
+ }
+ if(v > (avg * 2)) {
+ isAboveAvg2 = true
+ }
+ }
+ }
+ val r = new ArrayBuffer[String]
+ if(isInMax)
+ r += prefix + "IsInMax"
+ if(isAboveAvg)
+ r += prefix + "isAboveAvg"
+ if(isAboveAvg2)
+ r += prefix + "isAboveAvg2"
+ r.toArray
+ }
+ }*/
+
+ def featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = {
+ val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val queryNonemptyList = queryOutcomes.map(_.isEmpty);
+ val ment = queries.head.originalMent;
+ val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1;
+ /*val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB)
+
+ val PMINGDvals = Seq(
+ GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"),
+ GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"),
+ GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"),
+ GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-")
+ )*/
+
+ // TODO: this is not correct yet: we need to know what the rest of the
+ // document will be annotated with, but these are the denotations for a
+ // single example, which is not enough; we need every candidate annotation
+ // for the whole document.
+ //
+ // in the wikification paper the references are chosen jointly, so we would
+ // need to look at pairs of references
+
+ val denotationSim = denotations.map(t => wikiDB.textDB.compareDocumentC(ment.rawDoc.documentVectorCache, t))
+ val denotationSimMax = denotationSim.max
+ val denotationSimAvg = denotationSim.sum / denotationSim.size
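+ // denotationSim(i) is a word-overlap score between the mention's document
+ // vector and candidate page i; the max and average give reference points for
+ // the binned Compariable* features below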
+
+ // TODO: implement the local vector features that compare the text of the pages;
+ // the context could be the set of pages linking into/out of a page, though
+ // that isn't quite a similarity measure
+
+
+
+ Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => {
+ val feats = new ArrayBuffer[Int];
+ def feat(str: String) = addFeat(str, feats, addToIndexer);
+ /*for(p <- PMINGDvals)
+ for(f <- p(denIdx))
+ feat(f)
+ */
+ val query = queries(queryIdx);
+ val den = denotations(denIdx);
+ if (den == NilToken) {
+ feat("NilAndQueryNonempty=" + queryNonemptyList(queryIdx));
+ } else if (queryOutcomes(queryIdx).containsKey(den)) {
+ val queryDescriptorWithProper = (if (ment.pos(ment.headIdx - ment.startIdx) == "NNP") "PROP" else "NOM") + "-" + query.queryType;
+ val queryRank = queryOutcomes(queryIdx).getSortedKeys().indexOf(den);
+ feat("Rank=" + queryDescriptorWithProper + "-" + (queryRank + 1))
+ val queryStr = query.getFinalQueryStr;
+ val matchesQuery = den.toLowerCase == queryStr.toLowerCase;
+ feat("MatchesQuery=" + queryDescriptorWithProper + "-" + matchesQuery)
+ if (!matchesQuery) {
+ feat("ContainsQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.contains(queryStr.toLowerCase)));
+ feat("StartsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.startsWith(queryStr.toLowerCase)));
+ feat("EndsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.endsWith(queryStr.toLowerCase)));
+ }
+ val denotationHasParenthetical = den.contains("(") && den.endsWith(")");
+ feat("ContainsParenthetical=" + queryDescriptorWithProper + "-" + denotationHasParenthetical);
+ if (denotationHasParenthetical) {
+ feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase))
+ }
+ feat("CompariableWordsLog="+Math.floor(Math.log(denotationSim(denIdx))))
+ feat("CompariableIsMaxWordSim=" + (denotationSim(denIdx) == denotationSimMax))
+ feat("CompariableWordsAboveAvg=" + (denotationSim(denIdx) > denotationSimAvg))
+ feat("CompariableWordsReweight="+Math.floor(denotationSim(denIdx) / denotationSimMax * 10))
+ } else {
+ feat("Impossible");
+ }
feats.toArray;
});
}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
index cdb1566..c4ad61f 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
@@ -1,13 +1,14 @@
package edu.berkeley.nlp.entity.wiki
-import edu.berkeley.nlp.entity.Chunk
+import edu.berkeley.nlp.entity.{Document, Chunk, GUtil}
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.util.Counter
import scala.collection.JavaConverters._
import edu.berkeley.nlp.entity.joint.JointDocACE
import java.io.PrintWriter
+import scala.collection.mutable.ArrayBuffer
+
object WikificationEvaluator {
def removeExcludes(chunks: Seq[Chunk[String]]) = chunks.filter(chunk => chunk.label != ExcludeToken)
@@ -78,6 +79,48 @@ object WikificationEvaluator {
}
Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom));
}
+
+
+ // build the set of all gold titles and the set of all predicted titles for
+ // each document, then compute an F1 over those bags of titles
+ def evaluateBOTF1_mfl(results : Map[Document, Seq[(Seq[String], Seq[String], Document)]]) = {
+ // f1 = 2 * precision * recall / (percison + recall)
+ var correct = 0
+ var precDenom = 0
+ var recDenom = 0
+ for((doc, matches) <- results) {
+ var seenBefore = Set[String]()
+ val allGold = Set(matches.flatMap(_._1):_*)
+ val allChoosen = Set(matches.map(_._2(0)):_*) //Set(matches.flatMap(_._2):_*)
+
+ /*for((gold, selected, _) <- matches) {
+ val goldS = Set(gold:_*)
+ val selectedS = Set(selected(0)) //Set(selected:_*)
+ val ints = goldS & selectedS
+ //if(!ints.subsetOf(seenBefore)) {
+ correct += ints.size
+ seenBefore ++= ints
+ //}
+ }*/
+ // TODO: something wrong with computing the set intersection
+
+ val dprecDenom = allChoosen.size
+ val drecDenom = allGold.size
+ var dcorrect = 0
+ allChoosen.foreach(c => {
+ if(isCorrect(allGold.toSeq, c))
+ dcorrect += 1
+ })
+ //val diff = (allGold ++ allChoosen) -- (allGold & allChoosen)
+ //val dcorrect = (allGold & allChoosen).size
+ //Logger.logss("Document f1: "+GUtil.renderPRF1(dcorrect, dprecDenom, drecDenom))
+ precDenom += dprecDenom
+ recDenom += drecDenom
+ correct += dcorrect
+ }
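+ // e.g. with gold {A, B} and chosen {A, C} for a single document:
+ // dcorrect = 1, dprecDenom = 2, drecDenom = 2, so P = R = F1 = 0.5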
+ Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom))
+ }
+
def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = {
val bagOfTitles = titles.flatMap(sentTitles => {
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
index 88f9ff2..d87fbfe 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
@@ -5,9 +5,8 @@ import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
-import edu.berkeley.nlp.entity.ConllDocReader
+import edu.berkeley.nlp.entity.{WikiDocReader, ConllDocReader, GUtil}
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
import edu.berkeley.nlp.entity.lang.Language
@@ -27,7 +26,7 @@ import edu.berkeley.nlp.entity.wiki._
* java -cp /path/to/jar -Xmx8g edu.berkeley.nlp.entity.wiki.WikipediaInterface \
* -datasetPaths path/to/test-docs-directory-one-doc-per-file,path/to/additional/docs,... \
* -wikipediaDumpPath path/to/enwiki-latest-pages-articles.xml
- * -outputDir path/to/output-file.ser.gz
+ * -outputPath path/to/output-file.ser.gz
*
* Required arguments:
* -datasetPaths: pointer to CoNLL-formatted files whose mentions we should extract
@@ -57,7 +56,8 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
val redirectsDB: WikipediaRedirectsDB,
val categoryDB: WikipediaCategoryDB,
val linksDB: WikipediaLinkDB,
- val auxDB: WikipediaAuxDB) extends Serializable {
+ val auxDB: WikipediaAuxDB,
+ val textDB: WikipediaTextDB) extends Serializable {
def getStandardPriorForJointModel(ment: Mention) = {
val counter = new Counter[String];
@@ -75,7 +75,9 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
def disambiguate(ment: Mention) = disambiguateBest(ment, ment.headIdx)
def disambiguateBest(ment: Mention, specifiedHeadIdx: Int) = {
- redirectsDB.followRedirect(titleGivenSurfaceDB.disambiguateQueries(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)));
+ redirectsDB.followRedirect(
+ titleGivenSurfaceDB.disambiguateQueries(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr)));
}
def disambiguateBestNoDisambig(query: Query) = {
@@ -95,19 +97,51 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
}
def disambiguateBestGetAllOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def disambiguateBestGetAllOptions(query: Query) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(
+ Seq(query.getFinalQueryStr))));
}
-
+
+ def merge[T](a: Counter[T], b: Counter[T]) = {
+ for(k <- a.keySet().asScala) {
+ b.incrementCount(k, a.getCount(k))
+ }
+ }
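+ // e.g. merging a counter {x: 1, y: 2} into a target {y: 3} leaves the target
+ // with {x: 1, y: 5}; note that the second argument is mutated in place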
+
+ def disambigRes(query: Query) = {
+ val str = query.getFinalQueryStr
+ var titles = titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(str))
+ titles.incrementCount(str, 1.0)
+ var redirs = redirectsDB.followRedirectsCounter(titles)
+ merge(titles, redirs)
+ //var aux = auxDB.purgeDisambiguationAll(redirs)
+ //merge(redirs, aux)
+ //aux
+ redirs
+ }
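+ // unlike disambiguateBestGetAllOptions, this keeps the raw surface titles,
+ // the query string itself, and the redirect targets all as candidates, and
+ // skips the auxDB disambiguation-page purge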
+
+
+
def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def disambiguateBestGetAllOneBestOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def getCategories(title: String) = categoryDB.getCategories(title);
@@ -160,6 +194,8 @@ object WikipediaInterface {
val categoryDBInputPath = "";
val categoryDBOutputPath = "";
+
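+  // optional path to gold Wikipedia standoff annotations; when set, the gold
+  // link targets are added to the query set below so that titles pointing at
+  // redirect pages can be normalized at train/test time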
+ val wikiStandoff = "";
def processWikipedia(wikipediaPath: String, queries: Set[String], parser: CoarseToFineMaxRuleParser, backoffParser: CoarseToFineMaxRuleParser): WikipediaInterface = {
val titleGivenSurface = WikipediaTitleGivenSurfaceDB.processWikipedia(wikipediaPath, queries);
@@ -168,11 +204,12 @@ object WikipediaInterface {
val links = if (WikipediaInterface.computeLinkDB) {
WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc);
} else {
- new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]);
+ new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]);
}
val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser);
val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc);
- val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux);
+ val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc);
+ val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux, texts);
wi.printSome();
wi;
}
@@ -184,10 +221,11 @@ object WikipediaInterface {
val links = if (WikipediaInterface.computeLinkDB) {
WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc);
} else {
- new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]);
+ new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]);
}
val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc);
- val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux);
+ val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc);
+ val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux, texts);
wi.printSome();
wi;
}
@@ -211,22 +249,59 @@ object WikipediaInterface {
val mentionPropertyComputer = new MentionPropertyComputer(None);
val pmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = false);
val gmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = true);
- val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path => {
- if (WikipediaInterface.mentionType == "old") {
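+    // a dataset path may carry a mention-type prefix, e.g. "ace:path/to/data";
+    // otherwise the global WikipediaInterface.mentionType flag applies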
+    val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(rawPath => {
+      var path = rawPath
+      val mentionType = if (path.contains(":")) {
+        val s = path.split(":")
+        path = s(1)
+        s(0)
+      } else {
+        WikipediaInterface.mentionType
+      }
+      Logger.logss("Loading documents " + mentionType + " " + path)
+ if (mentionType == "old") {
// Wikification dataset: use only auto_conll and pred mentions
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer));
- } else if (WikipediaInterface.mentionType == "ace") {
+ } else if (mentionType == "ace") {
// ACE: Use gold mentions here
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => gmAssembler.createCorefDoc(doc, mentionPropertyComputer));
- } else if (WikipediaInterface.mentionType == "ontonotes") {
+ } else if (mentionType == "ontonotes") {
// OntoNotes: use only auto_conll and pred mentions
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer));
+ } else if (mentionType == "wikiser") {
+ WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => {
+ try {
+ gmAssembler.createCorefDoc(doc, mentionPropertyComputer)
+ } catch {
+            case e: Exception => {
+              // about 30 documents currently have broken references; log the
+              // failure and emit null so the document is filtered out below
+              println("FAIL DOCUMENT: " + doc.docID)
+ null
+ }
+ }
+ })
} else {
-        throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType);
+        throw new RuntimeException("Unrecognized mention type: " + mentionType);
}
- });
+  }).filter(_ != null);
// val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet;
- val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet;
+
+  // MFL TODO: these are the queries that will have to be rewritten to support the wiki documents.
+ var queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/)
+ .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr))
+ .toSet;
+  // some gold titles in the older dataset link to pages that are now redirects,
+  // so load them here so the redirects can be normalized during training/testing
+  val golds: Set[String] = if (wikiStandoff.nonEmpty) {
+    WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiStandoff).flatMap(d => {
+      d._2.flatMap(v => {
+        v._2.flatMap(_.label).map(_.replace("_", " "))
+      })
+    }).toSet
+  } else {
+    Set[String]()
+  }
+ queries = queries ++ golds
Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents");
val interface = if (WikipediaInterface.categoryDBInputPath != "") {
val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB];
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
index cdcb894..d2b00cb 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
@@ -1,6 +1,7 @@
package edu.berkeley.nlp.entity.wiki
import edu.berkeley.nlp.futile.fig.basic.Indexer
+import scala.collection.mutable
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.futile.fig.basic.IOUtils
@@ -14,33 +15,73 @@ import edu.berkeley.nlp.entity.lang.Language
import edu.berkeley.nlp.entity.wiki._
@SerialVersionUID(9084163557546777842L)
-class WikipediaLinkDB(val pageNameIndex: Indexer[String],
- val inLinksMap: HashMap[String,Array[Int]],
- val outLinksMap: HashMap[String,Array[Int]]) extends Serializable {
- var outLinksSetCache: HashMap[String,Set[Int]] = null;
-
+class WikipediaLinkDB(private val pageNameIndex: Indexer[String],
+ private val inLinksMap: HashMap[Int,Array[Int]],
+ private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable {
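+  // the link maps are now keyed by each page's index in pageNameIndex rather
+  // than by the raw title string; the set caches below are @transient, so they
+  // are dropped on serialization and rebuilt lazily (null-checked) on first use
+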
+ @transient
+ private var outLinksSetCache : mutable.HashMap[String,Set[Int]] = null
+
+ @transient
+ private var inLinksSetCache : mutable.HashMap[String,Set[Int]] = null
+
def getOutLinks(title: String) = {
- if (outLinksMap.contains(title)) {
- outLinksMap(title);
+ val k = pageNameIndex.indexOf(title)
+ if (outLinksMap.contains(k)) {
+ outLinksMap(k);
} else {
Array[Int]();
}
}
-
- def getOutLinksSetUseCache(title: String) = {
- if (outLinksMap.contains(title)) {
- if (outLinksSetCache == null) {
- outLinksSetCache = new HashMap[String,Set[Int]];
+
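+  // mirror of getOutLinks for the incoming-link direction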
+ def getInLinks(title: String) = {
+ val k = pageNameIndex.indexOf(title)
+ if(inLinksMap.contains(k)) {
+ inLinksMap(k)
+ } else {
+ Array[Int]()
+ }
+ }
+
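+  // set view of the in-links with a small memo cache; rather than evicting
+  // single entries, the whole cache is reset once it grows past 1000 entries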
+ def getInLinksSetUseCache(title: String) : Set[Int] = {
+ if(inLinksSetCache == null) {
+ inLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ if(inLinksSetCache.contains(title)) {
+ inLinksSetCache(title)
+ } else {
+ val k = pageNameIndex.indexOf(title)
+ if(k != -1) {
+ if (inLinksSetCache.size > 1000) {
+ inLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ val s = inLinksMap.getOrElse(k, Array[Int]()).toSet
+ inLinksSetCache.put(title, s)
+ s
+ } else {
+ Set[Int]()
}
- if (!outLinksSetCache.contains(title)) {
+ }
+ }
+
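+  // same caching scheme as getInLinksSetUseCache, applied to the out-links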
+ def getOutLinksSetUseCache(title: String) : Set[Int] = {
+ if(outLinksSetCache == null) {
+ outLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ if(outLinksSetCache.contains(title)) {
+ outLinksSetCache(title)
+ } else {
+ val k = pageNameIndex.indexOf(title)
+ if(k != -1) {
if (outLinksSetCache.size > 1000) {
- outLinksSetCache.dropRight(1);
+            // dropRight(1) never mutated the map, and evicting a single entry
+            // was too slow; just reset the whole cache
+ outLinksSetCache = new mutable.HashMap[String,Set[Int]]()
}
- outLinksSetCache.put(title, outLinksMap(title).toSet);
+ val s = outLinksMap.getOrElse(k, Array[Int]()).toSet
+ outLinksSetCache.put(title, s)
+ s
+ } else {
+ Set[Int]()
}
- outLinksSetCache(title);
- } else {
- Set[Int]();
}
}
@@ -56,9 +97,11 @@ class WikipediaLinkDB(val pageNameIndex: Indexer[String],
}
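+  // the indexOf lookups are hoisted below so each title is only indexed once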
def doesOneLinkToOther(title1: String, title2: String): Boolean = {
+ val ti1 = pageNameIndex.indexOf(title1)
+ val ti2 = pageNameIndex.indexOf(title2)
val outLinksTitle1 = getOutLinks(title1);
val outLinksTitle2 = getOutLinks(title2);
- outLinksTitle1.contains(pageNameIndex.indexOf(title2)) || outLinksTitle2.contains(pageNameIndex.indexOf(title1))
+ outLinksTitle1.contains(ti2) || outLinksTitle2.contains(ti1)
}
}
@@ -66,18 +109,19 @@ object WikipediaLinkDB {
def processWikipedia(wikipediaPath: String, pageTitleSetLc: Set[String]): WikipediaLinkDB = {
val pageNamesIndex = new Indexer[String];
- val inLinksMap = new HashMap[String,HashSet[Int]];
- val outLinksMap = new HashMap[String,HashSet[Int]];
+ val inLinksMap = new HashMap[Int,HashSet[Int]];
+ val outLinksMap = new HashMap[Int,HashSet[Int]];
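+    // both maps are built keyed by page index (via pageNamesIndex) to match the
+    // new WikipediaLinkDB constructor above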
val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath));
var currentPageTitle = "";
- var linksThisPage = new StringBuilder();
+ var currentPageTitleind = 0
+ //var linksThisPage = new StringBuilder();
var doneWithThisPage = false;
var numPagesSeen = 0;
var lineIdx = 0;
- var isInText = false;
- val categoryMap = new HashMap[String,ArrayBuffer[String]];
- val infoboxMap = new HashMap[String,String];
- val appositiveMap = new HashMap[String,String];
+ //var isInText = false;
+ //val categoryMap = new HashMap[String,ArrayBuffer[String]];
+ //val infoboxMap = new HashMap[String,String];
+ //val appositiveMap = new HashMap[String,String];
// Extract first line that's not in brackets
while (lines.hasNext) {
val line = lines.next;
@@ -96,10 +140,16 @@ object WikipediaLinkDB {
} else if (line.contains("