diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..99fe3d9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+berkeley-entity-models.tgz
+data.tgz
+data/
+expers/
+models/
+project/project/
+project/target/
+target/
+specify_execDir/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..61b103e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,21 @@
+# convenience targets for building and running the pipeline
+
+TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar
+
+all: $(TARGET)
+
+# make's $(wildcard) does not recurse into subdirectories, so use find instead
+$(TARGET): $(shell find src -type f)
+	sbt assembly
+
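+# run the ACE'05 tester over the CoNLL-format data under data/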
+aceTester: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.ACETester -dataPath data/ace05/ace05-all-conll
+
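+# run the query chooser against the serialized ACE wiki DB in models/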
+queryModel: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz
+
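+# preprocess the raw ACL 2011 Wikification texts (WIKILIMITED mode)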
+wikiLimited: $(TARGET)
+ java -cp $(TARGET) edu.berkeley.nlp.entity.preprocess.PreprocessingDriver ++config/base.conf -inputDir ../WikificationACL2011Data/WikipediaSample/RawTextsTrain/ -outputDir /tmp/gggg/raw/ -mode WIKILIMITED
diff --git a/build.sbt b/build.sbt
index 91a4b9b..77738fd 100644
--- a/build.sbt
+++ b/build.sbt
@@ -4,9 +4,11 @@ name := "berkeley-entity"
version := "1"
-scalaVersion := "2.11.2"
+scalaVersion := "2.11.6"
assemblySettings
mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver")
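+// ship resources/ (Messages_*.properties, interwiki.properties, operators.txt) inside the jar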
+unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" }
diff --git a/resources/Messages_de.properties b/resources/Messages_de.properties
new file mode 100644
index 0000000..51b38e9
--- /dev/null
+++ b/resources/Messages_de.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Inhaltsverzeichnis
+wiki.api.url=http://de.wikipedia.org/w/api.php
+wiki.api.category1=Kategorie
+wiki.api.image1=Datei
+wiki.api.template1=Vorlage
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_en.properties b/resources/Messages_en.properties
new file mode 100644
index 0000000..6b9e2f0
--- /dev/null
+++ b/resources/Messages_en.properties
@@ -0,0 +1,37 @@
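+# each namespace has two accepted aliases, keyed with suffixes 1 and 2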
+wiki.tags.toc.content=Contents
+wiki.api.url=http://en.wikipedia.org/w/api.php
+wiki.api.media1=Media
+wiki.api.media2=Media
+wiki.api.special1=Special
+wiki.api.special2=Special
+wiki.api.talk1=Talk
+wiki.api.talk2=Talk
+wiki.api.user1=User
+wiki.api.user2=User
+wiki.api.usertalk1=User_talk
+wiki.api.usertalk2=User_talk
+wiki.api.meta1=Meta
+wiki.api.meta2=Meta
+wiki.api.metatalk1=Meta_talk
+wiki.api.metatalk2=Meta_talk
+wiki.api.image1=Image
+wiki.api.image2=File
+wiki.api.imagetalk1=Image_talk
+wiki.api.imagetalk2=File_talk
+wiki.api.mediawiki1=MediaWiki
+wiki.api.mediawiki2=MediaWiki
+wiki.api.mediawikitalk1=MediaWiki_talk
+wiki.api.mediawikitalk2=MediaWiki_talk
+wiki.api.template1=Template
+wiki.api.template2=Template
+wiki.api.templatetalk1=Template_talk
+wiki.api.templatetalk2=Template_talk
+wiki.api.help1=Help
+wiki.api.help2=Help
+wiki.api.helptalk1=Help_talk
+wiki.api.helptalk2=Help_talk
+wiki.api.category1=Category
+wiki.api.category2=Category
+wiki.api.categorytalk1=Category_talk
+wiki.api.categorytalk2=Category_talk
\ No newline at end of file
diff --git a/resources/Messages_es.properties b/resources/Messages_es.properties
new file mode 100644
index 0000000..bc50428
--- /dev/null
+++ b/resources/Messages_es.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Contenido
+wiki.api.url=http://es.wikipedia.org/w/api.php
+wiki.api.category1=Categor\u00EDa
+wiki.api.image1=Imagen
+wiki.api.template1=Plantilla
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
diff --git a/resources/Messages_fr.properties b/resources/Messages_fr.properties
new file mode 100644
index 0000000..2a76842
--- /dev/null
+++ b/resources/Messages_fr.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Sommaire
+wiki.api.url=http://fr.wikipedia.org/w/api.php
+wiki.api.category1=Cat\u00E9gorie
+wiki.api.image1=Image
+wiki.api.template1=Mod\u00E8le
+wiki.api.category2=Category
+wiki.api.image2=Image
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_it.properties b/resources/Messages_it.properties
new file mode 100644
index 0000000..97778a3
--- /dev/null
+++ b/resources/Messages_it.properties
@@ -0,0 +1,8 @@
+wiki.tags.toc.content=Indice
+wiki.api.url=http://it.wikipedia.org/w/api.php
+wiki.api.category1=Categoria
+wiki.api.image1=Immagine
+wiki.api.template1=Template
+wiki.api.category2=Category
+wiki.api.image2=File
+wiki.api.template2=Template
\ No newline at end of file
diff --git a/resources/Messages_pt_BR.properties b/resources/Messages_pt_BR.properties
new file mode 100644
index 0000000..e0baaf7
--- /dev/null
+++ b/resources/Messages_pt_BR.properties
@@ -0,0 +1,38 @@
+#Generated by ResourceBundle Editor (http://eclipse-rbe.sourceforge.net)
+
+wiki.api.category1 = Categoria
+wiki.api.category2 = Categoria
+wiki.api.categorytalk1 = Categoria_Discuss\u00E3o
+wiki.api.categorytalk2 = Categoria_Discuss\u00E3o
+wiki.api.help1 = Ajuda
+wiki.api.help2 = Ajuda
+wiki.api.helptalk1 = Ajuda_Discuss\u00E3o
+wiki.api.helptalk2 = Ajuda_Discuss\u00E3o
+wiki.api.image1 = Imagem
+wiki.api.image2 = Arquivo
+wiki.api.imagetalk1 = Imagem_Discuss\u00E3o
+wiki.api.imagetalk2 = Arquivo_Discuss\u00E3o
+wiki.api.media1 = M\u00EDdia
+wiki.api.media2 = M\u00EDdia
+wiki.api.mediawiki1 = MediaWiki
+wiki.api.mediawiki2 = MediaWiki
+wiki.api.mediawikitalk1 = MediaWiki_Discuss\u00E3o
+wiki.api.mediawikitalk2 = MediaWiki_Discuss\u00E3o
+wiki.api.meta1 = Meta
+wiki.api.meta2 = Meta
+wiki.api.metatalk1 = Meta_Discuss\u00E3o
+wiki.api.metatalk2 = Meta_Discuss\u00E3o
+wiki.api.special1 = Especial
+wiki.api.special2 = Especial
+wiki.api.talk1 = Discuss\u00E3o
+wiki.api.talk2 = Discuss\u00E3o
+wiki.api.template1 = Predefini\u00E7\u00E3o
+wiki.api.template2 = Predefini\u00E7\u00E3o
+wiki.api.templatetalk1 = Predefini\u00E7\u00E3o_Discuss\u00E3o
+wiki.api.templatetalk2 = Predefini\u00E7\u00E3o_Discuss\u00E3o
+wiki.api.url = http://pt.wikipedia.org/w/api.php
+wiki.api.user1 = Usu\u00E1rio
+wiki.api.user2 = Usu\u00E1rio
+wiki.api.usertalk1 = Usu\u00E1rio_Discuss\u00E3o
+wiki.api.usertalk2 = Usu\u00E1rio_Discuss\u00E3o
+wiki.tags.toc.content = Conte\u00FAdo
diff --git a/resources/interwiki.properties b/resources/interwiki.properties
new file mode 100644
index 0000000..b312b1e
--- /dev/null
+++ b/resources/interwiki.properties
@@ -0,0 +1,393 @@
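+# interwiki prefix -> URL template; ${title} is replaced with the target page title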
+be-x-old=http://be-x-old.wikipedia.org/wiki/${title}
+tavi=http://tavi.sourceforge.net/${title}
+xh=http://xh.wikipedia.org/wiki/${title}
+lasvegaswiki=http://wiki.gmnow.com/index.php/${title}
+pmeg=http://www.bertilow.com/pmeg/${title}.php
+warpedview=http://www.warpedview.com/index.php/${title}
+slashdot=http://slashdot.org/article.pl?sid=${title}
+wikimedia=http://wikimediafoundation.org/wiki/${title}
+wikia=http://www.wikia.com/wiki/index.php/${title}
+wo=http://wo.wikipedia.org/wiki/${title}
+jefo=http://www.esperanto-jeunes.org/vikio/index.php?${title}
+openfacts=http://openfacts.berlios.de/index.phtml?title=${title}
+lqwiki=http://wiki.linuxquestions.org/wiki/${title}
+wa=http://wa.wikipedia.org/wiki/${title}
+ciscavate=http://ciscavate.org/index.php/${title}
+demokraatia=http://wiki.demokraatia.ee/
+efnetpythonwiki=http://purl.net/wiki/python/${title}
+mediazilla=http://bugzilla.wikipedia.org/${title}
+wikiquote=http://en.wikiquote.org/wiki/${title}
+jbo=http://jbo.wikipedia.org/wiki/${title}
+vo=http://vo.wikipedia.org/wiki/${title}
+vi=http://vi.wikipedia.org/wiki/${title}
+gamewiki=http://gamewiki.org/wiki/index.php/${title}
+hewikisource=http://he.wikisource.org/wiki/${title}
+ve=http://ve.wikipedia.org/wiki/${title}
+google=http://www.google.com/search?q=${title}
+uz=http://uz.wikipedia.org/wiki/${title}
+drumcorpswiki=http://www.drumcorpswiki.com/index.php/${title}
+nah=http://nah.wikipedia.org/wiki/${title}
+ur=http://ur.wikipedia.org/wiki/${title}
+jiniwiki=http://www.cdegroot.com/cgi-bin/jini?${title}
+uk=http://uk.wikipedia.org/wiki/${title}
+ug=http://ug.wikipedia.org/wiki/${title}
+osi\ reference\ model=http://wiki.tigma.ee/
+mbtest=http://www.usemod.com/cgi-bin/mbtest.pl?${title}
+disinfopedia=http://www.disinfopedia.org/wiki.phtml?title=${title}
+ty=http://ty.wikipedia.org/wiki/${title}
+squeak=http://minnow.cc.gatech.edu/squeak/${title}
+tw=http://tw.wikipedia.org/wiki/${title}
+tlh=http://tlh.wikipedia.org/wiki/${title}
+tt=http://tt.wikipedia.org/wiki/${title}
+ts=http://ts.wikipedia.org/wiki/${title}
+tr=http://tr.wikipedia.org/wiki/${title}
+scoutpedia=http://www.scoutpedia.info/index.php/${title}
+minnan=http://zh-min-nan.wikipedia.org/wiki/${title}
+to=http://to.wikipedia.org/wiki/${title}
+tn=http://tn.wikipedia.org/wiki/${title}
+wikinfo=http://www.wikinfo.org/wiki.php?title=${title}
+s23wiki=http://is-root.de/wiki/index.php/${title}
+tl=http://tl.wikipedia.org/wiki/${title}
+aiwiki=http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?${title}
+tk=http://tk.wikipedia.org/wiki/${title}
+ti=http://ti.wikipedia.org/wiki/${title}
+th=http://th.wikipedia.org/wiki/${title}
+tg=http://tg.wikipedia.org/wiki/${title}
+fr.fr=http://fr.fr.wikinations.org/${title}
+te=http://te.wikipedia.org/wiki/${title}
+csb=http://csb.wikipedia.org/wiki/${title}
+theopedia=http://www.theopedia.com/${title}
+ta=http://ta.wikipedia.org/wiki/${title}
+acadwiki=http://xarch.tu-graz.ac.at/autocad/wiki/${title}
+efnetceewiki=http://purl.net/wiki/c/${title}
+phpwiki=http://phpwiki.sourceforge.net/phpwiki/index.php?${title}
+tmwiki=http://www.EasyTopicMaps.com/?page=${title}
+sw=http://sw.wikipedia.org/wiki/${title}
+benefitswiki=http://www.benefitslink.com/cgi-bin/wiki.cgi?${title}
+ecxei=http://www.ikso.net/cgi-bin/wiki.pl?${title}
+sv=http://sv.wikipedia.org/wiki/${title}
+uea=http://www.tejo.org/uea/${title}
+su=http://su.wikipedia.org/wiki/${title}
+st=http://st.wikipedia.org/wiki/${title}
+ss=http://ss.wikipedia.org/wiki/${title}
+sr=http://sr.wikipedia.org/wiki/${title}
+sq=http://sq.wikipedia.org/wiki/${title}
+so=http://so.wikipedia.org/wiki/${title}
+sn=http://sn.wikipedia.org/wiki/${title}
+sm=http://sm.wikipedia.org/wiki/${title}
+sl=http://sl.wikipedia.org/wiki/${title}
+sk=http://sk.wikipedia.org/wiki/${title}
+cache=http://www.google.com/search?q=cache:${title}
+svgwiki=http://www.protocol7.com/svg-wiki/default.asp?${title}
+si=http://si.wikipedia.org/wiki/${title}
+smikipedia=http://www.smikipedia.org/${title}
+simple=http://simple.wikipedia.org/wiki/${title}
+sh=http://sh.wikipedia.org/wiki/${title}
+sg=http://sg.wikipedia.org/wiki/${title}
+gentoo-wiki=http://gentoo-wiki.com/${title}
+se=http://se.wikipedia.org/wiki/${title}
+webseitzwiki=http://webseitz.fluxent.com/wiki/${title}
+sd=http://sd.wikipedia.org/wiki/${title}
+sc=http://sc.wikipedia.org/wiki/${title}
+jamwiki=http://jamwiki.org/wiki/en/${title}
+sa=http://sa.wikipedia.org/wiki/${title}
+greencheese=http://www.greencheese.org/${title}
+linuxwiki=http://www.linuxwiki.de/${title}
+diveintoosx=http://diveintoosx.org/${title}
+bridgeswiki=http://c2.com/w2/bridges/${title}
+rw=http://rw.wikipedia.org/wiki/${title}
+ru=http://ru.wikipedia.org/wiki/${title}
+corpknowpedia=http://corpknowpedia.org/wiki/index.php/${title}
+echei=http://www.ikso.net/cgi-bin/wiki.pl?${title}
+ro=http://ro.wikipedia.org/wiki/${title}
+rn=http://rn.wikipedia.org/wiki/${title}
+rm=http://rm.wikipedia.org/wiki/${title}
+wikispecies=http://species.wikipedia.org/wiki/${title}
+webdevwikinl=http://www.promo-it.nl/WebDevWiki/index.php?page=${title}
+sourceforge=http://sourceforge.net/${title}
+pythonwiki=http://www.pythonwiki.de/${title}
+roa-rup=http://roa-rup.wikipedia.org/wiki/${title}
+tmnet=http://www.technomanifestos.net/?${title}
+gmailwiki=http://www.gmailwiki.com/index.php/${title}
+plog4u=http://plog4u.org/index.php/${title}
+googlegroups=http://groups.google.com/groups?q=${title}
+wikiworld=http://WikiWorld.com/wiki/index.php/${title}
+qu=http://qu.wikipedia.org/wiki/${title}
+consciousness=http://teadvus.inspiral.org/
+eljwiki=http://elj.sourceforge.net/phpwiki/index.php/${title}
+lojban=http://www.lojban.org/tiki/tiki-index.php?page=${title}
+usej=http://www.tejo.org/usej/${title}
+tokipona=http://tokipona.wikipedia.org/wiki/${title}
+mathsongswiki=http://SeedWiki.com/page.cfm?wikiid=237&doc=${title}
+got=http://got.wikipedia.org/wiki/${title}
+shakti=http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/${title}
+memoryalpha=http://www.memory-alpha.org/en/index.php/${title}
+cliki=http://ww.telent.net/cliki/${title}
+pt=http://pt.wikipedia.org/wiki/${title}
+fr.ca=http://fr.ca.wikinations.org/${title}
+ps=http://ps.wikipedia.org/wiki/${title}
+fur=http://fur.wikipedia.org/wiki/${title}
+wikicities=http://www.wikicities.com/index.php/${title}
+pl=http://pl.wikipedia.org/wiki/${title}
+pi=http://pi.wikipedia.org/wiki/${title}
+wiktionary=http://en.wiktionary.org/wiki/${title}
+turismo=http://www.tejo.org/turismo/${title}
+pa=http://pa.wikipedia.org/wiki/${title}
+terrorwiki=http://www.liberalsagainstterrorism.com/wiki/index.php/${title}
+finalempire=http://final-empire.sourceforge.net/cgi-bin/wiki.pl?${title}
+fr.be=http://fr.wikinations.be/${title}
+os=http://os.wikipedia.org/wiki/${title}
+or=http://or.wikipedia.org/wiki/${title}
+netvillage=http://www.netbros.com/?${title}
+seattlewireless=http://seattlewireless.net/?${title}
+om=http://om.wikipedia.org/wiki/${title}
+pangalacticorg=http://www.pangalactic.org/Wiki/${title}
+seeds=http://www.IslandSeeds.org/wiki/${title}
+oc=http://oc.wikipedia.org/wiki/${title}
+raec=http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/${title}
+ny=http://ny.wikipedia.org/wiki/${title}
+nv=http://nv.wikipedia.org/wiki/${title}
+foldoc=http://www.foldoc.org/foldoc/foldoc.cgi?${title}
+no=http://no.wikipedia.org/wiki/${title}
+nn=http://nn.wikipedia.org/wiki/${title}
+metawikipedia=http://meta.wikimedia.org/wiki/${title}
+wikif1=http://www.wikif1.org/${title}
+nl=http://nl.wikipedia.org/wiki/${title}
+ypsieyeball=http://sknkwrks.dyndns.org:1957/writewiki/wiki.pl?${title}
+ng=http://ng.wikipedia.org/wiki/${title}
+purlnet=http://purl.oclc.org/NET/${title}
+ne=http://ne.wikipedia.org/wiki/${title}
+nb=http://nb.wikipedia.org/wiki/${title}
+abbenormal=http://www.ourpla.net/cgi-bin/pikie.cgi?${title}
+na=http://na.wikipedia.org/wiki/${title}
+docbook=http://docbook.org/wiki/moin.cgi/${title}
+fr.org=http://fr.wikinations.org/${title}
+my=http://my.wikipedia.org/wiki/${title}
+brasilwiki=http://rio.ifi.unizh.ch/brasilienwiki/index.php/${title}
+mt=http://mt.wikipedia.org/wiki/${title}
+ms=http://ms.wikipedia.org/wiki/${title}
+mr=http://mr.wikipedia.org/wiki/${title}
+advogato=http://www.advogato.org/${title}
+senseislibrary=http://senseis.xmp.net/?${title}
+mo=http://mo.wikipedia.org/wiki/${title}
+mn=http://mn.wikipedia.org/wiki/${title}
+lutherwiki=http://www.lutheranarchives.com/mw/index.php/${title}
+ml=http://ml.wikipedia.org/wiki/${title}
+mk=http://mk.wikipedia.org/wiki/${title}
+mi=http://mi.wikipedia.org/wiki/${title}
+jspwiki=http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=${title}
+mh=http://mh.wikipedia.org/wiki/${title}
+mg=http://mg.wikipedia.org/wiki/${title}
+metaweb=http://www.metaweb.com/wiki/wiki.phtml?title=${title}
+kmwiki=http://www.voght.com/cgi-bin/pywiki?${title}
+efnetxmlwiki=http://purl.net/wiki/xml/${title}
+tejo=http://www.tejo.org/vikio/${title}
+zwiki=http://www.zwiki.org/${title}
+lv=http://lv.wikipedia.org/wiki/${title}
+lt=http://lt.wikipedia.org/wiki/${title}
+lo=http://lo.wikipedia.org/wiki/${title}
+foxwiki=http://fox.wikis.com/wc.dll?Wiki~${title}
+ln=http://ln.wikipedia.org/wiki/${title}
+emacswiki=http://www.emacswiki.org/cgi-bin/wiki.pl?${title}
+li=http://li.wikipedia.org/wiki/${title}
+bemi=http://bemi.free.fr/vikio/index.php?${title}
+lg=http://lg.wikipedia.org/wiki/${title}
+wikibooks=http://en.wikibooks.org/wiki/${title}
+lb=http://lb.wikipedia.org/wiki/${title}
+la=http://la.wikipedia.org/wiki/${title}
+creationmatters=http://www.ourpla.net/cgi-bin/wiki.pl?${title}
+ky=http://ky.wikipedia.org/wiki/${title}
+kw=http://kw.wikipedia.org/wiki/${title}
+kv=http://kv.wikipedia.org/wiki/${title}
+pikie=http://pikie.darktech.org/cgi/pikie?${title}
+evowiki=http://www.evowiki.org/index.php/${title}
+ku=http://ku.wikipedia.org/wiki/${title}
+ks=http://ks.wikipedia.org/wiki/${title}
+kr=http://kr.wikipedia.org/wiki/${title}
+haribeau=http://wiki.haribeau.de/cgi-bin/wiki.pl?${title}
+ko=http://ko.wikipedia.org/wiki/${title}
+kn=http://kn.wikipedia.org/wiki/${title}
+km=http://km.wikipedia.org/wiki/${title}
+kl=http://kl.wikipedia.org/wiki/${title}
+kk=http://kk.wikipedia.org/wiki/${title}
+kj=http://kj.wikipedia.org/wiki/${title}
+ki=http://ki.wikipedia.org/wiki/${title}
+why=http://clublet.com/c/c/why?${title}
+kg=http://kg.wikipedia.org/wiki/${title}
+ka=http://ka.wikipedia.org/wiki/${title}
+mus=http://mus.wikipedia.org/wiki/${title}
+hrwiki=http://www.hrwiki.org/index.php/${title}
+orgpatterns=http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?${title}
+jv=http://jv.wikipedia.org/wiki/${title}
+gotamac=http://www.got-a-mac.org/${title}
+dolphinwiki=http://www.object-arts.com/wiki/html/Dolphin/${title}
+zh-cn=http://zh.wikipedia.org/wiki/${title}
+visualworks=http://wiki.cs.uiuc.edu/VisualWorks/${title}
+iawiki=http://www.IAwiki.net/${title}
+freebsdman=http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=${title}
+ja=http://ja.wikipedia.org/wiki/${title}
+chy=http://chy.wikipedia.org/wiki/${title}
+unreal=http://wiki.beyondunreal.com/wiki/${title}
+iu=http://iu.wikipedia.org/wiki/${title}
+it=http://it.wikipedia.org/wiki/${title}
+is=http://is.wikipedia.org/wiki/${title}
+chr=http://chr.wikipedia.org/wiki/${title}
+usemod=http://www.usemod.com/cgi-bin/wiki.pl?${title}
+cmwiki=http://www.ourpla.net/cgi-bin/wiki.pl?${title}
+hammondwiki=http://www.dairiki.org/HammondWiki/index.php3?${title}
+cho=http://cho.wikipedia.org/wiki/${title}
+io=http://io.wikipedia.org/wiki/${title}
+personaltelco=http://www.personaltelco.net/index.cgi/${title}
+ik=http://ik.wikipedia.org/wiki/${title}
+haw=http://haw.wikipedia.org/wiki/${title}
+ii=http://ii.wikipedia.org/wiki/${title}
+wikisource=http://sources.wikipedia.org/wiki/${title}
+lugkr=http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?${title}
+ig=http://ig.wikipedia.org/wiki/${title}
+zh-cfr=http://zh-min-nan.wikipedia.org/wiki/${title}
+ie=http://ie.wikipedia.org/wiki/${title}
+id=http://id.wikipedia.org/wiki/${title}
+ia=http://ia.wikipedia.org/wiki/${title}
+openwiki=http://openwiki.com/?${title}
+hz=http://hz.wikipedia.org/wiki/${title}
+hy=http://hy.wikipedia.org/wiki/${title}
+strikiwiki=http://ch.twi.tudelft.nl/~mostert/striki/teststriki.pl?${title}
+hu=http://hu.wikipedia.org/wiki/${title}
+herzkinderwiki=http://www.herzkinderinfo.de/Mediawiki/index.php/${title}
+ht=http://ht.wikipedia.org/wiki/${title}
+hr=http://hr.wikipedia.org/wiki/${title}
+webisodes=http://www.webisodes.org/${title}
+globalvoices=http://cyber.law.harvard.edu/dyn/globalvoices/wiki/${title}
+ho=http://ho.wikipedia.org/wiki/${title}
+hi=http://hi.wikipedia.org/wiki/${title}
+elibre=http://enciclopedia.us.es/index.php/${title}
+alife=http://news.alife.org/wiki/index.php?${title}
+he=http://he.wikipedia.org/wiki/${title}
+ast=http://ast.wikipedia.org/wiki/${title}
+ha=http://ha.wikipedia.org/wiki/${title}
+revo=http://purl.org/NET/voko/revo/art/${title}.html
+arxiv=http://www.arxiv.org/abs/${title}
+sockwiki=http://wiki.socklabs.com/${title}
+gv=http://gv.wikipedia.org/wiki/${title}
+gu=http://gu.wikipedia.org/wiki/${title}
+gn=http://gn.wikipedia.org/wiki/${title}
+gl=http://gl.wikipedia.org/wiki/${title}
+seapig=http://www.seapig.org/${title}
+gd=http://gd.wikipedia.org/wiki/${title}
+ga=http://ga.wikipedia.org/wiki/${title}
+opera7wiki=http://nontroppo.org/wiki/${title}
+oeis=http://www.research.att.com/cgi-bin/access.cgi/as/njas/sequences/eisA.cgi?Anum=${title}
+moinmoin=http://purl.net/wiki/moin/${title}
+fy=http://fy.wikipedia.org/wiki/${title}
+gej=http://www.esperanto.de/cgi-bin/aktivikio/wiki.pl?${title}
+fr=http://fr.wikipedia.org/wiki/${title}
+arc=http://arc.wikipedia.org/wiki/${title}
+fo=http://fo.wikipedia.org/wiki/${title}
+fj=http://fj.wikipedia.org/wiki/${title}
+wikinews=http://en.wikinews.org/wiki/${title}
+fi=http://fi.wikipedia.org/wiki/${title}
+ff=http://ff.wikipedia.org/wiki/${title}
+annotationwiki=http://www.seedwiki.com/page.cfm?wikiid=368&doc=${title}
+sep11=http://sep11.wikipedia.org/wiki/${title}
+wlug=http://www.wlug.org.nz/${title}
+fa=http://fa.wikipedia.org/wiki/${title}
+eu=http://eu.wikipedia.org/wiki/${title}
+tmbw=http://www.tmbw.net/wiki/index.php/${title}
+et=http://et.wikipedia.org/wiki/${title}
+scn=http://scn.wikipedia.org/wiki/${title}
+es=http://es.wikipedia.org/wiki/${title}
+muweb=http://www.dunstable.com/scripts/MuWebWeb?${title}
+eo=http://eo.wikipedia.org/wiki/${title}
+en=http://en.wikipedia.org/wiki/${title}
+dejanews=http://www.deja.com/=dnc/getdoc.xp?AN=${title}
+el=http://el.wikipedia.org/wiki/${title}
+jargonfile=http://sunir.org/apps/meta.pl?wiki=JargonFile&redirect=${title}
+eokulturcentro=http://esperanto.toulouse.free.fr/wakka.php?wiki=${title}
+ee=http://ee.wikipedia.org/wiki/${title}
+tum=http://tum.wikipedia.org/wiki/${title}
+plog4u_de=http://plog4u.de/index.php/${title}
+dz=http://dz.wikipedia.org/wiki/${title}
+dv=http://dv.wikipedia.org/wiki/${title}
+kerimwiki=http://wiki.oxus.net/${title}
+dk=http://da.wikipedia.org/wiki/${title}
+de=http://de.wikipedia.org/wiki/${title}
+dwjwiki=http://www.suberic.net/cgi-bin/dwj/wiki.cgi?${title}
+da=http://da.wikipedia.org/wiki/${title}
+wlwiki=http://winslowslair.supremepixels.net/wiki/index.php/${title}
+cy=http://cy.wikipedia.org/wiki/${title}
+w=http://en.wikipedia.org/wiki/${title}
+cv=http://cv.wikipedia.org/wiki/${title}
+cs=http://cs.wikipedia.org/wiki/${title}
+cr=http://cr.wikipedia.org/wiki/${title}
+q=http://en.wikiquote.org/wiki/${title}
+co=http://co.wikipedia.org/wiki/${title}
+zh-min-nan=http://zh-min-nan.wikipedia.org/wiki/${title}
+n=http://en.wikinews.org/wiki/${title}
+m=http://meta.wikimedia.org/wiki/${title}
+annotation=http://bayle.stanford.edu/crit/nph-med.cgi/${title}
+ch=http://ch.wikipedia.org/wiki/${title}
+efnetcppwiki=http://purl.net/wiki/cpp/${title}
+ce=http://ce.wikipedia.org/wiki/${title}
+c2find=http://c2.com/cgi/wiki?FindPage&value=${title}
+b=http://en.wikibooks.org/wiki/${title}
+ca=http://ca.wikipedia.org/wiki/${title}
+dictionary=http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=${title}
+ang=http://ang.wikipedia.org/wiki/${title}
+zh-tw=http://zh.wikipedia.org/wiki/${title}
+bs=http://bs.wikipedia.org/wiki/${title}
+br=http://br.wikipedia.org/wiki/${title}
+twiki=http://twiki.org/cgi-bin/view/${title}
+bo=http://bo.wikipedia.org/wiki/${title}
+wikt=http://en.wiktionary.org/wiki/${title}
+bn=http://bn.wikipedia.org/wiki/${title}
+bm=http://bm.wikipedia.org/wiki/${title}
+bi=http://bi.wikipedia.org/wiki/${title}
+bh=http://bh.wikipedia.org/wiki/${title}
+bg=http://bg.wikipedia.org/wiki/${title}
+knowhow=http://www2.iro.umontreal.ca/~paquetse/cgi-bin/wiki.cgi?${title}
+be=http://be.wikipedia.org/wiki/${title}
+wiki=http://c2.com/cgi/wiki?${title}
+patwiki=http://gauss.ffii.org/${title}
+ba=http://ba.wikipedia.org/wiki/${title}
+rfc=http://www.rfc-editor.org/rfc/rfc${title}.txt
+zu=http://zu.wikipedia.org/wiki/${title}
+lanifexwiki=http://opt.lanifex.com/cgi-bin/wiki.pl?${title}
+twistedwiki=http://purl.net/wiki/twisted/${title}
+az=http://az.wikipedia.org/wiki/${title}
+ay=http://ay.wikipedia.org/wiki/${title}
+commons=http://commons.wikimedia.org/wiki/${title}
+acronym=http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=${title}
+av=http://av.wikipedia.org/wiki/${title}
+aspienetwiki=http://aspie.mela.de/Wiki/index.php?title=${title}
+as=http://as.wikipedia.org/wiki/${title}
+metawiki=http://sunir.org/apps/meta.pl?${title}
+ar=http://ar.wikipedia.org/wiki/${title}
+zh=http://zh.wikipedia.org/wiki/${title}
+pywiki=http://www.voght.com/cgi-bin/pywiki?${title}
+an=http://an.wikipedia.org/wiki/${title}
+am=http://am.wikipedia.org/wiki/${title}
+ak=http://ak.wikipedia.org/wiki/${title}
+infosecpedia=http://www.infosecpedia.org/pedia/index.php/${title}
+za=http://za.wikipedia.org/wiki/${title}
+af=http://af.wikipedia.org/wiki/${title}
+firstwiki=http://firstwiki.org/index.php/${title}
+als=http://als.wikipedia.org/wiki/${title}
+ab=http://ab.wikipedia.org/wiki/${title}
+aa=http://aa.wikipedia.org/wiki/${title}
+ursine=http://ursine.ca/${title}
+meatball=http://www.usemod.com/cgi-bin/mb.pl?${title}
+mozillawiki=http://wiki.mozilla.org/index.php/${title}
+imdb=http://us.imdb.com/Title?${title}
+pythoninfo=http://www.python.org/cgi-bin/moinmoin/${title}
+yo=http://yo.wikipedia.org/wiki/${title}
+seattlewiki=http://seattlewiki.org/wiki/${title}
+yi=http://yi.wikipedia.org/wiki/${title}
+vls=http://vls.wikipedia.org/wiki/${title}
+meta=http://meta.wikimedia.org/wiki/${title}
+susning=http://www.susning.nu/${title}
+nds=http://nds.wikipedia.org/wiki/${title}
+wikitravel=http://wikitravel.org/en/${title}
+codersbase=http://www.codersbase.com/${title}
+tpi=http://tpi.wikipedia.org/wiki/${title}
+ppr=http://c2.com/cgi/wiki?${title}
\ No newline at end of file
diff --git a/resources/operators.txt b/resources/operators.txt
new file mode 100644
index 0000000..7d9835d
--- /dev/null
+++ b/resources/operators.txt
@@ -0,0 +1,27 @@
+pre,-,PreMinus,4600
+pre,+,PrePlus,4600
+pre,not,Not,4600
+#
+in,^,Pow,3700
+#
+in,*,Times,3800
+in,/,Divide,3800
+in,div,Divide,3800
+in,mod,Mod,3800
+#
+in,+,Plus,2900
+in,-,Subtract,2900
+#
+in,round,Round,2800
+#
+in,=,Equal,2600
+in,!=,Unequal,2600
+in,<>,Unequal,2600
+in,>,Greater,2600
+in,>=,GreaterEqual,2600
+in,<,Less,2600
+in,<=,LessEqual,2600
+#
+in,and,And,2000
+#
+in,or,Or,1900
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
index d29aaa0..b4012e9 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala
@@ -17,13 +17,13 @@ case class ConllDoc(val docID: String,
val trees: Seq[DepConstTree],
val nerChunks: Seq[Seq[Chunk[String]]],
val corefChunks: Seq[Seq[Chunk[Int]]],
- val speakers: Seq[Seq[String]]) {
+ val speakers: Seq[Seq[String]]) extends Document {
- val numSents = words.size;
+ override val numSents = words.size;
- def uid = docID -> docPartNo;
+ override def uid = docID -> docPartNo;
- def fileName = {
+ override def fileName = {
if (docID.contains("/")) {
docID.substring(docID.lastIndexOf("/") + 1);
} else {
@@ -31,11 +31,11 @@ case class ConllDoc(val docID: String,
}
}
- def printableDocName = docID + " (part " + docPartNo + ")";
+ override def printableDocName = docID + " (part " + docPartNo + ")";
- def isConversation = docID.startsWith("bc") || docID.startsWith("wb");
-
- def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx);
+ override def isConversation = docID.startsWith("bc") || docID.startsWith("wb")
+
+ override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx);
}
object ConllDoc {
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
index 91685f3..299fe02 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala
@@ -38,7 +38,7 @@ class ConllDocReader(val lang: Language,
case _ => throw new RuntimeException("Bad language, no head finder for " + lang);
}
- def readConllDocs(fileName: String): Seq[ConllDoc] = {
+ def readConllDocs(fileName: String): Seq[Document] = {
val fcn = (docID: String, docPartNo: Int, docBySentencesByLines: ArrayBuffer[ArrayBuffer[String]]) => assembleConllDoc(docBySentencesByLines, docID, docPartNo);
ConllDocReader.readConllDocsGeneral(fileName, fcn);
}
@@ -283,7 +283,7 @@ object ConllDocReader {
// loadRawConllDocsWithSuffix(path, size, if (gold) "gold_conll" else "auto_conll", lang, betterParsesFile);
// }
- def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[ConllDoc] = {
+ def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[Document] = {
Logger.logss("Loading " + size + " docs from " + path + " ending with " + suffix);
val rawDir = new File(path);
if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) {
@@ -292,13 +292,23 @@
val rawFiles = rawDir.listFiles.sortBy(_.getAbsolutePath());
val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix));
val reader = new ConllDocReader(lang, betterParsesFile);
- val docs = new ArrayBuffer[ConllDoc];
+ val docs = new ArrayBuffer[Document];
var docCounter = 0;
var fileIdx = 0;
while (fileIdx < files.size && (size == -1 || docCounter < size)) {
- val newDocs = reader.readConllDocs(files(fileIdx).getAbsolutePath);
- docs ++= newDocs;
- docCounter += newDocs.size
+      val pp = files(fileIdx).getAbsolutePath
+      try {
+        Logger.logss("Loading doc: " + pp)
+        val newDocs = reader.readConllDocs(pp);
+        docs ++= newDocs;
+        docCounter += newDocs.size
+      } catch {
+        // log and skip unreadable documents rather than aborting the whole load
+        case e : Exception => {
+          Logger.logss("Failed to load document " + pp)
+          e.printStackTrace(System.err)
+        }
+      }
fileIdx += 1;
}
val numDocs = if (size == -1) docs.size else Math.min(size, files.size);
diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
index 395a268..422a694 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala
@@ -16,7 +16,7 @@ import edu.berkeley.nlp.entity.wiki.WikiAnnotReaderWriter
object ConllDocWriter {
- def writeDoc(writer: PrintWriter, conllDoc: ConllDoc, clustering: OrderedClusteringBound) {
+ def writeDoc(writer: PrintWriter, conllDoc: Document, clustering: OrderedClusteringBound) {
writeIncompleteConllDoc(writer, conllDoc.docID, conllDoc.docPartNo, conllDoc.words, conllDoc.pos, conllDoc.trees.map(_.constTree), conllDoc.speakers, conllDoc.nerChunks, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size));
// val corefBits = getCorefBits(conllDoc.words.map(_.size), convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size));
// val numZeroesToAddToPartNo = 3 - conllDoc.docPartNo.toString.size;
@@ -35,7 +35,7 @@ object ConllDocWriter {
}
def writeDocWithPredAnnotations(writer: PrintWriter,
- conllDoc: ConllDoc,
+ conllDoc: Document,
nerChunks: Seq[Seq[Chunk[String]]],
corefClustering: OrderedClusteringBound,
wikiChunks: Option[Seq[Seq[Chunk[String]]]] = None) {
@@ -45,7 +45,7 @@ object ConllDocWriter {
def writeDocWithPredAnnotationsWikiStandoff(writer: PrintWriter,
standoffWriter: PrintWriter,
- conllDoc: ConllDoc,
+ conllDoc: Document,
nerChunks: Seq[Seq[Chunk[String]]],
corefClustering: OrderedClusteringBound,
wikiChunks: Seq[Seq[Chunk[String]]]) {
@@ -54,7 +54,7 @@ object ConllDocWriter {
}
def writeIncompleteConllDoc(writer: PrintWriter,
- doc: ConllDoc) {
+ doc: Document) {
writeIncompleteConllDocNestedNER(writer, doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees.map(_.constTree), doc.speakers, doc.nerChunks, doc.corefChunks);
}
@@ -210,7 +210,7 @@ object ConllDocWriter {
}
}
- def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: ConllDoc) {
+ def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: Document) {
writer.println("O\t0\t0\tO\t-X-\t-DOCSTART-\tx\tx\t0");
// B-LOC 0 0 I-NP NNP Portugal x x 0
diff --git a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
index 641cd4c..31a0d06 100644
--- a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala
@@ -16,10 +16,11 @@ import java.util.Collections
import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder
import edu.berkeley.nlp.futile.ling.CollinsHeadFinder
+@SerialVersionUID(1L)
class DepConstTree(val constTree: Tree[String],
val pos: Seq[String],
val words: Seq[String],
- val childParentDepMap: HashMap[Int,Int]) {
+ val childParentDepMap: HashMap[Int,Int]) extends Serializable {
require(childParentDepMap.keys.toSeq.sorted.sameElements((0 until words.size)), PennTreeRenderer.render(constTree));
def size = words.size;
diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala
new file mode 100644
index 0000000..cf95766
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala
@@ -0,0 +1,39 @@
+package edu.berkeley.nlp.entity
+
+/**
+ * Minimal read-only view of an annotated document; ConllDoc and WikiDoc
+ * are the two implementations.
+ * Created by matthew on 2/18/15.
+ */
+trait Document {
+ def docID : String
+ def docPartNo : Int
+  // words of each sentence, including punctuation
+  def words : Seq[Seq[String]]
+  // part-of-speech tags, parallel to words
+  def pos : Seq[Seq[String]]
+  // parse tree (with dependencies) of each sentence
+  def trees : Seq[DepConstTree]
+  // named-entity chunks labeled with their types, e.g. ORG-NAM
+  def nerChunks : Seq[Seq[Chunk[String]]]
+  // [start, end) spans labeled with the ID of the entity
+  // they refer to
+  def corefChunks : Seq[Seq[Chunk[Int]]]
+  // one speaker entry per token; "-" when the speaker is unknown
+  def speakers : Seq[Seq[String]]
+
+  def numSents : Int = words.size
+
+ def uid : (String, Int) = docID -> docPartNo
+
+ def fileName : String
+
+ def printableDocName : String
+
+ def isConversation : Boolean = false
+
+ def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]]
+
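+  // mutable cache slot for a per-document vector; null until client code fills it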
+ var documentVectorCache: Array[Int] = null
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
index 1fad8ce..2bf9fa3 100644
--- a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala
@@ -153,7 +153,7 @@ object EntitySystem {
ConllDocReader.loadRawConllDocsWithSuffix(goldPath, size, goldSuffix));
} else {
(ConllDocReader.loadRawConllDocsWithSuffix(path, size, suffix),
- new ArrayBuffer[ConllDoc]());
+ new ArrayBuffer[Document]());
}
val goldWikification = new HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]];
val assembler = CorefDocAssembler(Driver.lang, Driver.useGoldMentions);
diff --git a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
index 803cd6d..8031560 100644
--- a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala
@@ -406,7 +406,7 @@ object GUtil {
def argMaxIdxFloat(values: Seq[Float]) = {
var currIdx = 0;
var maxIdx = 0;
- var maxVal = Double.NegativeInfinity;
+ var maxVal = Float.NegativeInfinity;
while (currIdx < values.size) {
if (values(currIdx) > maxVal) {
maxIdx = currIdx;
diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala
new file mode 100644
index 0000000..fc1ab62
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala
@@ -0,0 +1,37 @@
+package edu.berkeley.nlp.entity
+
+/**
+ * Created by matthew on 2/18/15.
+ */
+@SerialVersionUID(1L)
+case class WikiDoc (docID : String,
+ docPartNo : Int,
+ words : Seq[Seq[String]],
+ pos : Seq[Seq[String]],
+ trees: Seq[DepConstTree],
+ nerChunks : Seq[Seq[Chunk[String]]],
+ corefChunks : Seq[Seq[Chunk[Int]]],
+ speakers : Seq[Seq[String]],
+ wikiRefChunks : Seq[Seq[Chunk[String]]] ) extends Document {
+
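+  // wikiRefChunks holds, per sentence, the spans labeled with the Wikipedia titles they refer to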
+ override val numSents = words.size;
+
+ override def uid = docID -> docPartNo;
+
+ override def fileName = {
+ if (docID.contains("/")) {
+ docID.substring(docID.lastIndexOf("/") + 1);
+ } else {
+ docID;
+ }
+ }
+
+ override def printableDocName = docID + " (part " + docPartNo + ")";
+
+ override def isConversation = docID.startsWith("bc") || docID.startsWith("wb")
+
+  // wiki docs carry no gold NER layer, so there is never a corresponding chunk
+  override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None;
+
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala
new file mode 100644
index 0000000..2c2f6d8
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala
@@ -0,0 +1,18 @@
+package edu.berkeley.nlp.entity
+
+import edu.berkeley.nlp.entity.lang.Language
+
+/**
+ * Created by matthew on 2/18/15.
+ */
+class WikiDocReader (lang : Language, better : String) {} // TODO: remove
+
+object WikiDocReader {
+  def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = {
+    // docs come from one serialized List[WikiDoc] rather than a directory of files;
+    // the extra parameters are unused but kept for parity with ConllDocReader
+    val docs = GUtil.load(path).asInstanceOf[List[WikiDoc]]
+    // Seq is covariant, so List[WikiDoc] upcasts to Seq[Document] with no per-element casts
+    if (size == -1) docs else docs.take(size)
+  }
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
index bfd8b14..ee9b457 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala
@@ -9,22 +9,22 @@ import scala.sys.process.stringSeqToProcess
import scala.sys.process.Process
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.Driver
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ConllDocWriter
class CorefConllScorer(val conllEvalScriptPath: String) {
- def renderFinalScore(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = {
+ def renderFinalScore(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = {
val summary = score(conllDocs, rawPredClusterings, goldClusterings, true);
CorefConllScorer.processConllString(summary, false);
}
- def renderSuffStats(conllDoc: ConllDoc, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = {
+ def renderSuffStats(conllDoc: Document, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = {
val summary = score(Seq(conllDoc), Seq(rawPredClustering), Seq(goldClustering), false);
CorefConllScorer.processConllString(summary, true);
}
- def score(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = {
+ def score(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = {
val predClusterings = if (Driver.doConllPostprocessing) rawPredClusterings.map(_.postprocessForConll()) else rawPredClusterings;
// var predFile = File.createTempFile("temp", ".conll");
val (predFile, goldFile) = if (Driver.conllOutputDir != "" && saveTempFiles) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
index f7cc4b6..f5634fb 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala
@@ -10,9 +10,9 @@ import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.GUtil
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
-case class CorefDoc(val rawDoc: ConllDoc,
+case class CorefDoc(val rawDoc: Document,
val goldMentions: Seq[Mention],
val goldClustering: OrderedClustering,
val predMentions: Seq[Mention]) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
index 9c369e3..413e1cd 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala
@@ -12,17 +12,17 @@ import edu.berkeley.nlp.entity.lang.ChineseCorefLanguagePack
import edu.berkeley.nlp.entity.lang.ArabicCorefLanguagePack
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
case class ProtoMention(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIdx: Int);
case class ProtoMentionFancy(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIndices: Seq[Int]);
-case class ProtoCorefDoc(val doc: ConllDoc, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]);
+case class ProtoCorefDoc(val doc: Document, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]);
class CorefDocAssembler(val langPack: CorefLanguagePack,
val useGoldMentions: Boolean) {
- def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -31,7 +31,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def createCorefDocFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = {
+ def createCorefDocFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -41,11 +41,11 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack);
}
- def extractPredMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
+ def extractPredMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSorted(rawDoc, gms);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -54,7 +54,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- def extractPredMentionsFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = {
+ def extractPredMentionsFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSortedFancy(rawDoc, gms, possibleChunks);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -63,7 +63,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- private def getProtoMentionsSorted(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = {
+ private def getProtoMentionsSorted(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK
@@ -131,7 +131,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
// }
}
- private def getProtoMentionsSortedFancy(rawDoc: ConllDoc, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = {
+ private def getProtoMentionsSortedFancy(rawDoc: Document, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NPs and PRPs *except* for those contained in NE chunks (the NE tagger seems more reliable than the parser)
@@ -154,7 +154,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
}
}
- private def filterNonMaximalNPs(rawDoc: ConllDoc, mentionExtents: Seq[HashSet[ProtoMention]]) = {
+ private def filterNonMaximalNPs(rawDoc: Document, mentionExtents: Seq[HashSet[ProtoMention]]) = {
val filteredProtoMentionsSorted = (0 until rawDoc.numSents).map(i => new ArrayBuffer[ProtoMention]);
for (sentIdx <- 0 until mentionExtents.size) {
val protoMentionsByHead = mentionExtents(sentIdx).groupBy(_.headIdx);
@@ -211,7 +211,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
//////////////////
- def createCorefDocWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDocWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = extractGoldMentionsWithCoordination(rawDoc, propertyComputer);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
@@ -220,7 +220,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions)
}
- def extractGoldMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = {
val goldProtoMentionsSorted = getGoldProtoMentionsSortedWithCoordination(rawDoc);
val finalMentions = new ArrayBuffer[Mention]();
val goldClusterLabels = new ArrayBuffer[Int]();
@@ -238,7 +238,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
(finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels));
}
- def extractPredMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
+ def extractPredMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = {
val protoMentionsSorted = getProtoMentionsSortedWithCoordination(rawDoc, gms);
val finalMentions = new ArrayBuffer[Mention]();
for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) {
@@ -247,7 +247,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
finalMentions;
}
- private def getGoldProtoMentionsSortedWithCoordination(rawDoc: ConllDoc): Seq[Seq[ProtoMentionFancy]] = {
+ private def getGoldProtoMentionsSortedWithCoordination(rawDoc: Document): Seq[Seq[ProtoMentionFancy]] = {
val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield {
for (chunk <- rawDoc.corefChunks(sentIdx)) yield {
val headIndices = rawDoc.trees(sentIdx).getSpanHeadOrNPCoordinatedHeads(chunk.start, chunk.end);
@@ -257,7 +257,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack,
goldProtoMentions.map(_.sortBy(ment => (ment.sentIdx, ment.headIndices.head, ment.endIdx, ment.startIdx)));
}
- private def getProtoMentionsSortedWithCoordination(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = {
+ private def getProtoMentionsSortedWithCoordination(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = {
val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMentionFancy]);
for (sentIdx <- 0 until rawDoc.numSents) {
// Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK
@@ -442,7 +442,7 @@ object CorefDocAssembler {
new CorefDocAssembler(langPack, useGoldMentions);
}
- def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = {
+ def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = {
val goldProtoMentionsSorted = getGoldProtoMentionsSorted(rawDoc);
val finalMentions = new ArrayBuffer[Mention]();
val goldClusterLabels = new ArrayBuffer[Int]();
@@ -460,7 +460,7 @@ object CorefDocAssembler {
(finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels));
}
- def getGoldProtoMentionsSorted(rawDoc: ConllDoc): Seq[Seq[ProtoMention]] = {
+ def getGoldProtoMentionsSorted(rawDoc: Document): Seq[Seq[ProtoMention]] = {
val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield {
for (chunk <- rawDoc.corefChunks(sentIdx)) yield {
val headIdx = rawDoc.trees(sentIdx).getSpanHead(chunk.start, chunk.end);
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
index cacd259..41a80e3 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala
@@ -5,13 +5,13 @@ import edu.berkeley.nlp.futile.util.Logger
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.wiki.ACEMunger
import java.io.File
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
class CorefDocAssemblerACE(dirPath: String) {
val langPack = new EnglishCorefLanguagePack()
- def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = {
+ def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = {
val (goldMentions, goldClustering) = CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack);
if (goldMentions.size == 0) {
Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName);
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
index 208c342..85adc64 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala
@@ -39,7 +39,7 @@ import edu.berkeley.nlp.entity.xdistrib.DocumentGraphComponents
import edu.berkeley.nlp.futile.fig.exec.Execution
import edu.berkeley.nlp.entity.Driver
import edu.berkeley.nlp.entity.GUtil
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.WordNetInterfacer
import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.entity.ConllDocReader
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
index 58b9cd4..c31144a 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala
@@ -8,14 +8,15 @@ import edu.berkeley.nlp.entity.sem.SemClasser
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.util.Counter
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.Driver;
import edu.berkeley.nlp.entity.WordNetInterfacer
// TODO: Extract an interface for ConllDoc so I don't have to keep the whole
// document around...but while I'm feature engineering it's useful to be able
// to put my hands on anything I want
-class Mention(val rawDoc: ConllDoc,
+// (this change does exactly that: rawDoc is now typed against the Document trait)
+class Mention(val rawDoc: Document,
val mentIdx: Int,
val sentIdx: Int,
val startIdx: Int,
@@ -39,6 +40,18 @@
var cachedNerPossibilities: Option[Chunk[Counter[String]]] = None;
var cachedNerGold: Option[Chunk[String]] = None;
+  // renders the mention with one word of context on each side; endIdx is
+  // exclusive, following the [start, end) convention of Chunk spans
+  override def toString = {
+    var ret = "{"
+    if (startIdx > 0)
+      ret += rawDoc.words(sentIdx)(startIdx - 1) + " "
+    ret += "[" + spanToString + "]"
+    if (endIdx < rawDoc.words(sentIdx).size)
+      ret += " " + rawDoc.words(sentIdx)(endIdx)
+    ret + "}"
+  }
+
def speaker = rawDoc.speakers(sentIdx)(headIdx);
def headString = rawDoc.words(sentIdx)(headIdx);
@@ -247,7 +258,7 @@ object Mention {
val StartPosPlaceholder = "";
val EndPosPlaceholder = "";
- def createMentionComputeProperties(rawDoc: ConllDoc,
+ def createMentionComputeProperties(rawDoc: Document,
mentIdx: Int,
sentIdx: Int,
startIdx: Int,
diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
index 31c32f6..21b1ac7 100644
--- a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala
@@ -21,6 +21,8 @@ import edu.berkeley.nlp.entity.WordNetInterfacer
* DO NOT try to add WordNetInterfacer here! It is not serializable and so
* everything will explode when we try to serialize the model. So we choose
* to cache it on the documents even though this is pretty hacky.
+ *
+ * TODO: marking such fields @transient might be a cleaner fix than caching them on documents.
*/
@SerialVersionUID(1L)
class PairwiseIndexingFeaturizerJoint(val featureIndexer: Indexer[String],
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
index 512cc27..a78e96f 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala
@@ -15,10 +15,10 @@ import edu.berkeley.nlp.entity.Driver
import edu.berkeley.nlp.entity.ner.NerFeaturizer
import scala.collection.mutable.HashSet
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ner.NerPruner
-class JointDoc(val rawDoc: ConllDoc,
+class JointDoc(val rawDoc: Document,
val docGraph: DocumentGraph,
val goldNERChunks: Seq[Seq[Chunk[String]]],
val goldWikiChunks: Seq[Seq[Chunk[String]]]) {
@@ -71,7 +71,7 @@ class JointDoc(val rawDoc: ConllDoc,
object JointDoc {
- def apply(rawDoc: ConllDoc,
+ def apply(rawDoc: Document,
docGraph: DocumentGraph,
maybeGoldNERChunks: Option[Seq[Seq[Chunk[String]]]],
maybeGoldWikiChunks: Option[Seq[Seq[Chunk[String]]]]) = {
@@ -89,7 +89,7 @@ object JointDoc {
}
def assembleJointDocs(docGraphs: Seq[DocumentGraph],
- goldConllDocsForNER: Seq[ConllDoc],
+ goldConllDocsForNER: Seq[Document],
goldWikification: HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]) = {
docGraphs.map(docGraph => {
val rawDoc = docGraph.corefDoc.rawDoc;
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
index fc78b5e..85c9683 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala
@@ -5,13 +5,13 @@ import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.DocumentGraph
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.entity.wiki._
import edu.berkeley.nlp.futile.util.Logger
-class JointDocACE(val rawDoc: ConllDoc,
+class JointDocACE(val rawDoc: Document,
val docGraph: DocumentGraph,
val goldWikiChunks: Seq[Seq[Chunk[Seq[String]]]]) {
@@ -36,7 +36,7 @@ class JointDocACE(val rawDoc: ConllDoc,
object JointDocACE {
- def apply(rawDoc: ConllDoc,
+ def apply(rawDoc: Document,
docGraph: DocumentGraph,
maybeGoldWikiChunks: Option[Seq[Seq[Chunk[Seq[String]]]]]) = {
val goldWikiChunks = if (maybeGoldWikiChunks.isDefined) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
index 667672b..afeb3f7 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala
@@ -3,7 +3,7 @@ package edu.berkeley.nlp.entity.joint
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.entity.GUtil
diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
index 71e9274..cf93562 100644
--- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala
@@ -12,7 +12,7 @@ import edu.berkeley.nlp.entity.coref.CorefDocAssembler
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.Chunk
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.DocumentGraph
import edu.berkeley.nlp.futile.fig.exec.Execution
import edu.berkeley.nlp.entity.coref.CorefEvaluator
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
index a0f4c96..0627b42 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala
@@ -1,6 +1,6 @@
package edu.berkeley.nlp.entity.ner
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.futile.util.Logger
import edu.berkeley.nlp.entity.coref.CorefSystem
@@ -53,11 +53,11 @@ object NEEvaluator {
}));
}
- def evaluate(goldDocs: Seq[ConllDoc], predDocs: Seq[ConllDoc]) {
+ def evaluate(goldDocs: Seq[Document], predDocs: Seq[Document]) {
evaluateChunks(goldDocs, predDocs.map(_.nerChunks));
}
- def evaluateChunks(goldDocs: Seq[ConllDoc], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) {
+ def evaluateChunks(goldDocs: Seq[Document], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) {
var correct = 0;
val correctByLabel = new Counter[String];
var correctSameHead = 0;
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
index fd9cd40..911ba9c 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala
@@ -2,13 +2,13 @@ package edu.berkeley.nlp.entity.ner
import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer
import edu.berkeley.nlp.futile.fig.basic.IOUtils
object NESentenceMunger {
- def writeSentences(file: String, docs: Seq[ConllDoc]) {
+ def writeSentences(file: String, docs: Seq[Document]) {
val out = IOUtils.openOutHard(file);
for (doc <- docs; words <- doc.words) {
out.println(words.foldLeft("")(_ + " " + _).trim);
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
index 1b7a40f..e73e7c2 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala
@@ -2,7 +2,7 @@ package edu.berkeley.nlp.entity.ner
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.entity.coref.UID
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.fig.basic.Indexer
import edu.berkeley.nlp.entity.Driver
@@ -10,14 +10,14 @@ import edu.berkeley.nlp.futile.util.Logger
trait NerPruner {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]];
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]];
}
@SerialVersionUID(1L)
class NerPrunerFromModel(val nerModel: NerSystemLabeled,
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
val sentMarginals = nerModel.computeLogMarginals(doc.words(sentIdx).toArray, doc.pos(sentIdx).toArray);
NerPruner.pruneFromMarginals(sentMarginals, nerModel.labelIndexer, pruningThreshold);
}
@@ -28,7 +28,7 @@ class NerPrunerFromMarginals(val nerMarginals: HashMap[UID,Seq[Array[Array[Float
val neLabelIndexer: Indexer[String],
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
require(nerMarginals.contains(doc.uid), "Doc ID " + doc.uid + " doesn't have precomputed NER marginals" +
" and the NER pruner in this model is configured to rely on these. You need to either change" +
" how you specify the pruner (if training) or use a different model entirely (if testing)");
@@ -42,7 +42,7 @@ class NerPrunerFromMarginalsAndModel(val nerMarginals: HashMap[UID,Seq[Array[Arr
val nerModel: NerSystemLabeled,
val pruningThreshold: Double) extends NerPruner with Serializable {
- def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = {
+ def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = {
val sentMarginals = if (nerMarginals.contains(doc.uid)) {
nerMarginals(doc.uid)(sentIdx)
} else {
diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
index 2d1bb7a..7cf1b43 100644
--- a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala
@@ -2,11 +2,10 @@ package edu.berkeley.nlp.entity.ner
import edu.berkeley.nlp.futile.fig.basic.Indexer
import scala.collection.mutable.ArrayBuffer
import scala.collection.JavaConverters._
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity._
import edu.berkeley.nlp.futile.classify.GeneralLogisticRegression
import edu.berkeley.nlp.entity.coref.CorefSystem
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.classify.SequenceExample
import edu.berkeley.nlp.futile.fig.basic.IOUtils
import java.io.FileInputStream
@@ -15,12 +14,9 @@ import java.io.File
import java.io.FileOutputStream
import java.io.ObjectOutputStream
import edu.berkeley.nlp.futile.util.Counter
-import edu.berkeley.nlp.entity.Chunk
import scala.collection.mutable.HashMap
-import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.lang.Language
import scala.util.Random
-import edu.berkeley.nlp.entity.ConllDocWriter
import edu.berkeley.nlp.math.SloppyMath
import edu.berkeley.nlp.entity.wiki.WikipediaInterface
import edu.berkeley.nlp.entity.coref.UID
@@ -194,7 +190,8 @@ object NerSystemLabeled {
// transitionMatrix.map(_.map(arr => if (arr != null) arr.map(featureIndexer.getIndex(_)) else null));
// }
- def replaceNer(doc: ConllDoc, newChunks: Seq[Seq[Chunk[String]]]) = {
+ def replaceNer(doc: Document, newChunks: Seq[Seq[Chunk[String]]]) = {
+ // MFL TODO: this still builds a ConllDoc; does it need to handle other Document types too?
new ConllDoc(doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees, newChunks, doc.corefChunks, doc.speakers);
}
@@ -227,7 +224,7 @@ object NerSystemLabeled {
// TRAINING
- def trainNerSystem(trainDocs: Seq[ConllDoc],
+ def trainNerSystem(trainDocs: Seq[Document],
maybeBrownClusters: Option[Map[String,String]],
nerFeatureSet: Set[String],
reg: Double,
@@ -267,7 +264,7 @@ object NerSystemLabeled {
// EVALUATION
- def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[ConllDoc]) {
+ def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[Document]) {
val labelIndexer = nerSystem.labelIndexer;
Logger.logss("Extracting test examples");
val testExamples = extractNerChunksFromConll(testDocs);
@@ -332,7 +329,7 @@ object NerSystemLabeled {
}
}
- def extractNerChunksFromConll(docs: Seq[ConllDoc]): Seq[NerExample] = {
+ def extractNerChunksFromConll(docs: Seq[Document]): Seq[NerExample] = {
val chunkTypeCounts = new Counter[String];
val examples = docs.flatMap(doc => {
val chunksToUse = doc.nerChunks
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
index 1d3a0d7..78fba09 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java
@@ -12,6 +12,7 @@
import edu.berkeley.nlp.PCFGLA.TreeAnnotations;
import edu.berkeley.nlp.entity.ConllDocJustWords;
import edu.berkeley.nlp.entity.ConllDocReader;
+import edu.berkeley.nlp.entity.WikiDocReader;
import edu.berkeley.nlp.entity.lang.Language;
import edu.berkeley.nlp.entity.ner.NerSystemLabeled;
import edu.berkeley.nlp.futile.fig.basic.IOUtils;
@@ -92,7 +93,7 @@ public class PreprocessingDriver implements Runnable {
public static boolean useAlternateTokenizer = false;
public static enum Mode {
- RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL;
+ RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL, WIKILIMITED;
}
public static void main(String[] args) {
@@ -128,6 +129,9 @@ public void run() {
Logger.logss("Processed document " + docName + " and wrote result to " + outputDir);
}
writer.close();
+ } else if (mode == Mode.WIKILIMITED) {
+ WikiDocReader docReader = new WikiDocReader(Language.ENGLISH, "");
+ WikiPreprocessor.processesDocs(inputDir + "/", outputDir + "/", docReader, splitter, parser, backoffParser, nerSystem);
} else {
ConllDocReader docReader = new ConllDocReader(Language.ENGLISH, "");
for (File inputFile : new File(inputDir).listFiles()) {
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
index 19ac409..9e8ee9e 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala
@@ -1,7 +1,7 @@
package edu.berkeley.nlp.entity.preprocess
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import java.io.PrintWriter
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
index 8ac70d1..85c7a97 100644
--- a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala
@@ -1,5 +1,5 @@
package edu.berkeley.nlp.entity.preprocess
-import edu.berkeley.nlp.entity.ConllDoc
+import edu.berkeley.nlp.entity.Document
import edu.berkeley.nlp.entity.coref.CorefSystem
import scala.io.Source
import scala.collection.mutable.ArrayBuffer
@@ -99,8 +99,8 @@ object SentenceSplitter {
def featurize(featureIndexer: Indexer[String], addToIndexer: Boolean): Array[Int] = {
val featStrs = new ArrayBuffer[String];
- val pw = prevWord;
- val fw = followingWord;
+ val pw = if(prevWord.isEmpty) " " else prevWord
+ val fw = if (followingWord.isEmpty) " " else followingWord
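+ // N.B. pad empty tokens with a single space so pw.last and fw.charAt(0)
+ // below cannot throw on empty strings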
val fwcls = (if (Character.isUpperCase(fw.charAt(0))) "UC" else if (Character.isLowerCase(fw.charAt(0))) "LC" else if (!Character.isLetterOrDigit(fw.charAt(0))) "PU" else "OTHER");
featStrs += ("Bias=1");
featStrs += ("LastChar=" + pw.last);
@@ -242,7 +242,7 @@ object SentenceSplitter {
}
- private def readExamplesFromConll(docs: Seq[ConllDoc]): Seq[SplitExample] = {
+ private def readExamplesFromConll(docs: Seq[Document]): Seq[SplitExample] = {
// N.B. we only loop up until size - 1 since the end of the last sentence
// has no following context and isn't a good training example.
// We extract pretty much all positives except for really weird stuff.
diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala
new file mode 100644
index 0000000..21af271
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala
@@ -0,0 +1,282 @@
+package edu.berkeley.nlp.entity.preprocess
+
+import java.io.File
+
+import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
+import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder
+import edu.berkeley.nlp.entity._
+import edu.berkeley.nlp.entity.ner.NerSystemLabeled
+import edu.berkeley.nlp.futile.util.Logger
+import edu.berkeley.nlp.futile.syntax.Tree
+import edu.berkeley.nlp.futile.fig.basic.Indexer
+import edu.berkeley.nlp.futile.fig.basic.IOUtils
+
+import scala.collection.mutable.{ArrayBuffer, ListBuffer}
+import scala.xml._
+import scala.concurrent._
+import scala.collection.JavaConverters._
+
+import ExecutionContext.Implicits.global
+
+/**
+ * Created by matthew on 2/21/15.
+ */
+object WikiPreprocessor {
+
+ val headFinder = new ModCollinsHeadFinder()
+
+ def processesDocs (inputDir : String, outputDir : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) = {
+ val wikiDocs = new File(inputDir).listFiles.par.map(file => {
+ val input_file = file.getAbsolutePath
+ val output_file = outputDir + file.getName
+ try {
+ process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem)
+ } catch {
+ case e : Exception => {
+ Logger.logss("failed file: "+input_file)
+ System.err.print(e.toString)
+ e.printStackTrace(System.err)
+ null
+ }
+ }
+ }).filter(_ != null).toList
+ GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz")
+ }
+
+ def process(inputFile : String, outputFile : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) : WikiDoc = {
+ val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem)
+ val lines = wikiToConllLines(wdoc)
+ //val wlines = wiki.WikiAnnotReaderWriter.getWikiBits(wdoc.words.map(_.size), wdoc.wikiRefChunks)
+ val wlines = wikiToWikiLines(wdoc)
+ //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile)
+ writeWikiLines(wdoc.docID, lines, outputFile)
+ writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki"))
+ wdoc
+ }
+
+ def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = {
+ var writer = IOUtils.openOutHard(outputFile)
+ writer.println("#begin document (" + docID + "); part 000")
+ lines.foreach(l => {
+ l.foreach(writer.println(_))
+ writer.println
+ })
+ writer.close()
+ }
+
+ def wikiToConllLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ val ret = ListBuffer[Seq[String]]()
+ //ret.append("#begin document (" + wdoc.docID + "); part " + wdoc.docPartNo)
+ for(i <- 0 until wdoc.numSents) {
+ val parseBits = PreprocessingDriver.computeParseBits(Reprocessor.convertFromFutileTree(wdoc.trees(i).constTree))
+ //val nerBits = PreprocessingDriver.computeNerBits(wdoc.nerChunks(i).toArray)
+ val corefBits = computeBits(wdoc.corefChunks(i), wdoc.words(i).size)
+ var lines = new ListBuffer[String]()
+ // conll: [doc name] [part num] [word num] [word] [pos] [parsebit] [6] [7] [8] [speakers] [nerbit] [corefbit]
+ for(j <- 0 until wdoc.words(i).size) {
+ lines.append(wdoc.docID + "\t" +
+ wdoc.docPartNo + "\t" +
+ j + "\t" +
+ wdoc.words(i)(j) + "\t" +
+ wdoc.pos(i)(j) + "\t" +
+ parseBits(j) + "\t" +
+ "\t-\t-\t-\t" +
+ "-\t" + // speakers
+ "*\t" + // nerbit
+ corefBits(j) + "\t" // coref bits
+ )
+ }
+ ret.append(lines.toSeq)
+ }
+ ret.toSeq
+ }
+
+ def computeBits[T](items : Seq[Chunk[T]], len : Int) : Array[String] = {
+ var ret = Array.fill(len)(List[String]())
+ items.foreach(c => {
+ if(c.start == c.end -1) {
+ ret(c.start) = ret(c.start) :+ ("(" + c.label + ")")
+ } else {
+ ret(c.start) = ret(c.start) :+ ("(" + c.label)
+ ret(c.end - 1) = ret(c.end - 1) :+ (c.label + ")")
+ }
+ })
+ ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)})
+ }
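+ // e.g. (hypothetical chunks): computeBits(Seq(new Chunk(0, 1, 3), new Chunk(1, 4, 7)), 4)
+ // yields Array("(3)", "(7", "-", "7)"), the usual CoNLL bracket encoding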
+
+ /*def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ // this does not handle multiple chunks on the same span well, but that shouldn't be an issue, since wiki docs shouldn't have that
+ val ret = ListBuffer[Seq[String]]()
+ for(i <- 0 until wdoc.numSents) {
+ val lines = new ListBuffer[String]()
+ for(j <- 0 until wdoc.words(i).size) {
+ var s = ""
+ wdoc.wikiRefChunks(i).foreach(c => {
+ if(c.start == j)
+ s = "(" + c.label
+ })
+ s += "*"
+ wdoc.wikiRefChunks(i).foreach(c => {
+ if(c.end == j + 1)
+ s += ")"
+ })
+ lines.append(s)
+ }
+ ret.append(lines.toSeq)
+ }
+ ret.toSeq
+ }*/
+
+ def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = {
+ for (sentIdx <- 0 until wdoc.words.size) yield {
+ for (tokenIdx <- 0 until wdoc.words(sentIdx).size) yield {
+ val chunksStartingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.start == tokenIdx).sortBy(- _.end);
+ val numChunksEndingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.end - 1 == tokenIdx).size;
+ var str = if(chunksStartingHere.isEmpty) "" else {
+ chunksStartingHere.map("("+_.label.replace("(", "-LRB-").replace(")", "-RRB-").replace("*", "-STAR-")).reduce(_+"|"+_)
+ }
+ str += "*";
+ str += ")" * numChunksEndingHere
+ str;
+ }
+ }
+ }
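+ // e.g. a single chunk over tokens 0-1 labeled "Foo_(bar)" renders as the
+ // two tokens "(Foo_-LRB-bar-RRB-*" and "*)"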
+
+
+ def mkWikiDoc(inputFile : String,
+ docReader : WikiDocReader,
+ splitter : SentenceSplitter,
+ parser : CoarseToFineMaxRuleParser,
+ backoffParser : CoarseToFineMaxRuleParser,
+ nerSystem : NerSystemLabeled) : WikiDoc = {
+
+ Logger.logss("starting processing of " + inputFile)
+ val referencesFile = inputFile.replace("RawTexts", "Problems")
+ val refxml = XML.loadFile(referencesFile)
+ val document = scala.io.Source.fromFile(inputFile).mkString.split("\n")
+ val refname = (refxml \ "ReferenceFileName")(0).text.trim
+
+
+ val references = (refxml \ "ReferenceInstance").map(r => (
+ (r \ "SurfaceForm")(0).text.trim,
+ (r \ "Offset")(0).text.trim.toInt,
+ (r \ "Length")(0).text.trim.toInt,
+ (r \ "ChosenAnnotation")(0).text.trim,
+ (r \ "AnnotatorId")(0).text.trim,
+ (r \ "Annotation")(0).text.trim
+ ))
+
+ val canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(document, false, false)
+ val sentences = splitter.splitSentences(canonicalizedParagraphs)
+ val tokens = SentenceSplitter.tokenize(sentences)
+
+
+ val doclenratio = sentences.map(_.size).sum.toFloat / document.map(_.size + 1).sum
+ def refFinder (ref : (String, Int, Int, String, String, String)) : (Int, Chunk[String]) = {
+ val d = doclenratio * (ref._2 + ref._3 / 2.0)
+ var cnt = 0
+ val wrds = ref._1.replace(" ", "")
+
+ if(wrds.isEmpty) // guard against references with an empty surface form; skip them
+ return (-1, null)
+
+ def rank_match(i : Int, j : Int) : Double = {
+ val res = tokens(i).drop(j).reduce(_+_)
+ for(q <- 0 until Math.min(wrds.size, res.size)) {
+ if (res(q) != wrds(q))
+ return q.toDouble / wrds.size
+ }
+ 1.0
+ }
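+ // e.g. if wrds == "BarackObama" and the tokens from j on are
+ // ["Barack", "Obama", ","], res starts with all 11 characters of wrds, so
+ // rank_match returns 1.0; a mismatch at position q returns q / wrds.size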
+ var best_start = 0
+ var best_rank = Double.NegativeInfinity
+ var best_sentence = 0
+ for(i <- 0 until sentences.size) {
+ var tcnt = 0
+ for(j <- 0 until tokens(i).size) {
+ val r = rank_match(i, j) / Math.log(Math.abs(d - cnt - tcnt) + 2) // a little too simple, but works in most cases
+ if(r > best_rank) {
+ best_rank = r
+ best_start = j
+ best_sentence = i
+ }
+ tcnt += tokens(i)(j).size + 1 // +1 to match the space
+ }
+ cnt += sentences(i).size
+ }
+ var len = 0
+ var len_cnt = 0
+ for(j <- best_start until tokens(best_sentence).size; if len_cnt < wrds.size) {
+ len_cnt += tokens(best_sentence)(j).size
+ len += 1
+ }
+ if(len == 0)
+ return (-1, null)
+ (best_sentence, new Chunk(best_start, best_start + len, ref._4))
+ }
+
+ val refplaces = references.map(refFinder)
+
+ val refsorted = refplaces.foldLeft(Map[Int, List[Chunk[String]]]().withDefaultValue(List()))((m, itm) => {
+ if(itm._1 != -1) {
+ m.updated(itm._1, m(itm._1) :+ itm._2)
+ } else
+ m
+ })
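+ // e.g. refplaces = Seq((0, cA), (-1, null), (2, cB)) folds to
+ // Map(0 -> List(cA), 2 -> List(cB)); other sentence indices default to Nil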
+
+ val parses: Array[Tree[String]] = tokens.map(t => {
+ //try {
+ Reprocessor.convertToFutileTree(
+ PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))
+ /*} catch {
+ case e : java.lang.NullPointerException => {
+ null;
+ }
+ }*/
+ })
+
+ // filter out sentences whose parse yield doesn't match their tokens; unclear how this affects downstream results
+ val tps = (tokens, parses, 0 until tokens.size).zipped
+ .filter((a,b,c) => a.length == b.getYield.size)
+
+ val indexer = new Indexer[String]()
+
+ val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala })
+
+ val trees = for(i <- 0 until tps._1.size) yield {
+ val childParentMap = DepConstTree.extractDependencyStructure(tps._2(i), headFinder)
+ new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap)
+ }
+
+ val empty = tps._1.map(l => (0 until l.length).map(a=>"-")).toSeq
+
+ val wikiDoc = new WikiDoc(
+ docID=inputFile,
+ docPartNo=refname.toInt,
+ words=tps._1.toSeq.map(_.toSeq),
+ pos=pos,
+ trees=trees,
+ nerChunks=tps._1.map(a=>Seq()), // todo
+ corefChunks=tps._3.map(i => {
+ refsorted(i).map(c => new Chunk(c.start, c.end, indexer.getIndex(c.label)))
+ }),
+ speakers=empty, // todo?
+ wikiRefChunks=tps._3.map(refsorted(_))
+ )
+
+ Logger.logss("done with "+inputFile)
+
+ wikiDoc
+ }
+
+}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala
new file mode 100644
index 0000000..acd575e
--- /dev/null
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala
@@ -0,0 +1,42 @@
+package edu.berkeley.nlp.entity.wiki
+
+import edu.berkeley.nlp.futile.LightRunner
+
+/**
+ * Created by matthewfl
+ *
+ * We want to work with the who document at a time rather then just a single link
+ * this will allow us to
+ */
+class DocumentedSetChooser {
+
+}
+
+
+object DocumentedSetChooser {
+
+ val trainDataPath = "data/ace05/train";
+ val testDataPath = "data/ace05/dev";
+ val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both the train and dev sets
+ val wikiDBPath = "models/wiki-db-ace.ser.gz"
+
+ val lambda = 1e-8F
+ val batchSize = 1
+ val numItrs = 20
+
+ val maxNumWikificationOptions = 20 // previously 7
+
+ val numLoadedSamples = -1 // for debugging: load fewer documents (-1 loads all)
+
+
+ def main(args: Array[String]) = {
+ LightRunner.initializeOutput(DocumentedSetChooser.getClass)
+ LightRunner.populateScala(DocumentedSetChooser.getClass, args)
+
+ // load the documents
+
+
+
+ LightRunner.finalizeOutput()
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
index 4d03771..b145732 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala
@@ -3,16 +3,14 @@ package edu.berkeley.nlp.entity.wiki
import edu.berkeley.nlp.entity.lang.Language
import edu.berkeley.nlp.futile.LightRunner
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
-import edu.berkeley.nlp.entity.ConllDocReader
+import edu.berkeley.nlp.entity._
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.fig.basic.Indexer
import edu.berkeley.nlp.entity.joint.LikelihoodAndGradientComputer
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.entity.coref.CorefDoc
import edu.berkeley.nlp.futile.math.SloppyMath
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.Chunk
import edu.berkeley.nlp.entity.joint.GeneralTrainer
/**
@@ -31,6 +29,8 @@ case class JointQueryDenotationExample(val queries: Seq[Query],
// Feature caches since feature computation is expensive if redone every time
var cachedFeatsEachQuery: Array[Array[Int]] = null;
var cachedFeatsEachQueryDenotation: Array[Array[Array[Int]]] = null;
+
+ def document = queries.head.originalMent.rawDoc
}
/**
@@ -42,11 +42,13 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
val featureIndexer: Indexer[String]) extends LikelihoodAndGradientComputer[JointQueryDenotationExample] {
// Used for feature computation
val queryChooser = new QueryChoiceComputer(wikiDB, featureIndexer)
-
+
def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) {
if (ex.cachedFeatsEachQuery == null) {
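+ // lazily build and cache a bag-of-words vector for the mention's document
+ // so repeated featurization does not recompute it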
+ if(ex.document.documentVectorCache == null)
+ ex.document.documentVectorCache = wikiDB.textDB.makeVector(ex.document.words)
ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer)
- ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations(ex.queries, ex.allDenotations, addToIndexer)
+ ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB)
}
}
@@ -55,13 +57,17 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
*/
def getUnnormalizedJointScores(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Array[Float]] = {
featurizeUseCache(ex, false)
+ // each example caches one feature vector per query; features are indicators,
+ // so a query's raw score is the sum of the weights at its cached feature
+ // indices
val rawQueryScores = ex.cachedFeatsEachQuery.map(feats => GUtil.scoreIndexedFeats(feats, weights));
+ // scores for each (query, denotation) pair, computed the same way
val queryDenotationMatrix = ex.cachedFeatsEachQueryDenotation.map(_.map(feats => GUtil.scoreIndexedFeats(feats, weights)));
val scores = Array.tabulate(ex.queries.size, ex.allDenotations.size)((i, j) => Float.NegativeInfinity)
- for (queryIdx <- 0 until ex.queries.size) {
- for (denotationIdx <- 0 until ex.allDenotations.size) {
- scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx)
- }
+ for (queryIdx <- 0 until ex.queries.size; denotationIdx <- 0 until ex.allDenotations.size) {
+ // all features are indicators, so the joint score of a (query, denotation)
+ // pair is the sum of the query score and the query-denotation score
+ scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx)
}
scores
}
@@ -72,7 +78,9 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
*/
def getDenotationLogMarginals(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Float] = {
val scores = getUnnormalizedJointScores(ex, weights)
- // Sum up each column
+ // the scores matrix holds unnormalized log-probabilities,
+ // p(q,d) \propto exp(w^T f(q,d)), so logAdd marginalizes over queries
+ // in log space
val rawDenotationMarginals = Array.tabulate(ex.allDenotations.size)(i => SloppyMath.logAdd(scores.map(_(i))).toFloat)
val normalizer = SloppyMath.logAdd(rawDenotationMarginals).toFloat
(0 until rawDenotationMarginals.size).foreach(i => rawDenotationMarginals(i) -= normalizer)
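+ // e.g. with scores = [[-1.0f], [-0.5f]] (two queries, one denotation), the raw
+ // marginal is logAdd(-1.0, -0.5) ≈ -0.026 before the normalizer is subtracted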
@@ -132,12 +140,47 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface,
class JointQueryDenotationChooser(val featureIndexer: Indexer[String],
val weights: Array[Float]) extends Serializable {
- def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = {
+ /*def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = {
val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
- val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query));
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]());
computer.computeDenotation(ex, weights)
+ }*/
+
+ def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : (Seq[(String, Int)], Array[Array[Int]]) = {
+ val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions)
+ val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]());
+ val denotationMarginals = computer.getDenotationLogMarginals(ex, weights)
+
+ (ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse,
+ ex.cachedFeatsEachQuery)
+ }
+
+ def printEverything(queries: Seq[Query], wikiDB: WikipediaInterface, correctInd: Int) = {
+ // just redo the same computations as pickDenotations (wasteful but simple)
+ val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer);
+ val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions)
+ val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]());
+ val denotationMarginals = computer.getDenotationLogMarginals(ex, weights)
+
+ val sortedItms = ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse
+
+ println(
+ s"""Correct item in $correctInd (${sortedItms(correctInd)._1})
+ |\tGuessed value: ${sortedItms(0)._1}""".stripMargin)
+ for(i <- 0 until queries.length) {
+ println("\t\t"+i+": "+queries(i))
+ println("\t\t"+ex.cachedFeatsEachQuery(i).map(featureIndexer.getObject(_)).mkString(" "))
+ for(j <- 0 until ex.allDenotations.length) {
+ println("\t\t\t"+j+": "+ex.allDenotations(j)+": "+ex.cachedFeatsEachQueryDenotation(i)(j).map(featureIndexer.getObject(_)).mkString(" "))
+ }
+ }
+ println()
}
+
}
object JointQueryDenotationChooser {
@@ -159,11 +202,17 @@ object JointQueryDenotationChooser {
// There are multiple possible gold Wikipedia titles for some mentions. Note that
// NIL (no entry in Wikipedia) is included as an explicit choice, so this includes NILs (as
// it should according to how the task is defined)
- val goldLabel = getGoldWikification(goldWikification(docName), ment)
+ val goldLabelp = getGoldWikification(goldWikification(docName), ment)
+ val goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct
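+ // accept both the annotated titles and their redirect targets as gold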
if (goldLabel.size >= 1) {
+ //val oldqueries = Query.extractQueriesBest_old(ment, true);
val queries = Query.extractQueriesBest(ment, true);
- val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_));
+ /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) {
+ println("failed...")
+ }*/
+ //val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_));
// val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_));
+ val queryDisambigs = queries.map(wikiDB.disambigRes(_))
val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions);
val correctDenotations = denotations.filter(denotation => isCorrect(goldLabel, denotation))
// N.B. The use of "isCorrect" here is needed to canonicalize
@@ -171,6 +220,10 @@ object JointQueryDenotationChooser {
// if (correctIndices.isEmpty &&
if (filterImpossible && correctIndices.isEmpty) {
numImpossible += 1;
+ //println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations)
+ /*if(goldLabel.contains("Lord_Speaker")) {
+ println("wtfwtf")
+ }*/
} else {
exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel)
}
@@ -182,26 +235,52 @@ object JointQueryDenotationChooser {
exs;
}
+
+ def loadDocuments(path : String) = {
+ val limit = numLoadedSamples // -1 loads everything; a small value like 500 helps when debugging
+ if(path.startsWith("wikiser:")) {
+ WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, "", Language.ENGLISH)
+ } else {
+ ConllDocReader.loadRawConllDocsWithSuffix(path, limit, "", Language.ENGLISH)
+ }
+ }
+
val trainDataPath = "data/ace05/train";
val testDataPath = "data/ace05/dev";
- val wikiPath = "data/ace05/ace05-all-conll-wiki"
+ val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both the train and dev sets
val wikiDBPath = "models/wiki-db-ace.ser.gz"
val lambda = 1e-8F
val batchSize = 1
val numItrs = 20
- val maxNumWikificationOptions = 7
+ val maxNumWikificationOptions = 20 // previously 7
+
+ val numLoadedSamples = -1 // for debugging: load fewer documents (-1 loads all)
def main(args: Array[String]) {
LightRunner.initializeOutput(JointQueryDenotationChooser.getClass());
LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args)
// Read in CoNLL documents
val assembler = CorefDocAssembler(Language.ENGLISH, true);
- val trainDocs = ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH);
- val trainCorefDocs = trainDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
-
+ val trainDocs = loadDocuments(trainDataPath);
+ val trainCorefDocs = trainDocs.map(doc => {
+ try {
+ assembler.createCorefDoc(doc, new MentionPropertyComputer(None))
+ } catch {
+ case e : Exception => {
+ // TODO: fix the wikidocument parser
+ println("failed document "+doc.docID)
+ null
+ }
+ }
+ }).filter(_!=null);
+
+ //val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH);
+ val testDocs = loadDocuments(testDataPath)
+ val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
+
// Read in gold Wikification labels
val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath)
// Read in the title given surface database
@@ -209,7 +288,9 @@ object JointQueryDenotationChooser {
// Make training examples, filtering out those with solutions that are unreachable because
// they're not good for training
val trainExs = extractExamples(trainCorefDocs, goldWikification, wikiDB, filterImpossible = true)
-
+
+ // TODO: make this system work over a whole document at a time
+
// Extract features
val featIndexer = new Indexer[String]
val computer = new JointQueryDenotationChoiceComputer(wikiDB, featIndexer);
@@ -225,16 +306,63 @@ object JointQueryDenotationChooser {
// Build the test examples and decode the test set
// No filtering now because we're doing test
- val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH);
- val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None)));
- val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false);
- val goldTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[Seq[String]](i, i+1, testExs(i).rawCorrectDenotations))
- val predTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[String](i, i+1, chooser.pickDenotation(testExs(i).queries, wikiDB)))
+
+ val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = true) // was filterImpossible = false
+
+ var correctItemWasInSet = 0
+
+ val results = testExs.map(t => {
+ // TODO: need more then one perdicted title
+ val (picks, denFeats) = chooser.pickDenotations(t.queries, wikiDB)
+ if(!isCorrect(t.rawCorrectDenotations, picks(0)._1)) {
+ // the pick is not correct; check whether the correct answer appears
+ // anywhere else in the picks list
+ /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) {
+ // the correct pick was the second answer instead of the first one
+ // try and report the differences between the two items
+ println("second pick was correct")
+
+ }*/
+ var qq = -1
+ for((p, i) <- picks.zipWithIndex) {
+ // try: t.correctDenotations here?
+ if(isCorrect(t.correctDenotations, p._1) || isCorrect(t.rawCorrectDenotations, p._1)) {
+ //println("Found correct item with "+i)
+ correctItemWasInSet += 1
+ qq = i
+ //println("found correct item")
+ }
+ }
+ if(qq != -1) {
+ chooser.printEverything(t.queries, wikiDB, qq)
+ /*println(
+ s"""Correct item in place: $qq
+ |\tcorrect value: ${picks(qq)}
+ |\t\t${denFeats(picks(qq)._2).flatMap(featIndexer.getObject(_)).mkString(" ")}
+ |\tchosen value : ${picks(0)}
+ |\t\t${denFeats(picks(0)._2).flatMap(featIndexer.getObject(_)).mkString(" ")}
+ """.stripMargin)
+*/
+ } else {
+ println("THIS QUERY SHOULD HAVE BEEN FILTERED")
+ }
+ }
+ (t.rawCorrectDenotations, picks.map(_._1), t.queries(0).originalMent.rawDoc)
+ })
+
+ val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1))
+ val predTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[String](i, i+1, results(i)._2(0)))
// Hacky but lets us reuse some code that normally evaluates things with variable endpoints
// WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks))
WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set())
-
+
+ val mentionsByDoc = results.groupBy(_._3)
+
+ WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc)
+ println("Number of correct items that were in the set: "+correctItemWasInSet)
+
+
LightRunner.finalizeOutput();
}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
index ce86957..71a1869 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala
@@ -16,7 +16,8 @@ case class Query(val words: Seq[String],
val originalMent: Mention,
val finalSpan: (Int, Int),
val queryType: String,
- val removePuncFromQuery: Boolean = true) {
+ val removePuncFromQuery: Boolean = true,
+ val features: List[String] = List[String]()) {
def getFinalQueryStr = {
val wordsNoPunc = if (removePuncFromQuery) {
@@ -40,9 +41,16 @@ object Query {
val PluralQueryExpand = true;
val RemovePuncFromQuery = true;
val UseFirstHead = true;
- val MaxQueryLen = 4;
- val BlackList = Set("the", "a", "my", "your", "his", "her", "our", "their", "its", "this", "that", "these", "those")
- val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' ');
+ val MaxQueryLen = 8;
+ val BlackList = Set(
+ "the", "a", "my", "your", "his", "her", "our",
+ "their", "its", "this", "that", "these", "those",
+ "of"
+ )
+ val PuncList = Set(
+ ',', '.', '!', '?', ':', ';', '\'', '"', '(', ')',
+ '[', ']', '{', '}', ' '
+ )
/**
* Check if a token is "blacklisted", meaning that we shouldn't form a query that starts with
@@ -73,7 +81,7 @@ object Query {
* considering different subsets of the words in the mention and munging capitalization and
* stemming, since lowercasing and dropping a plural-marking "s" are useful for nominals.
*/
- def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
+ def extractQueriesBest_old(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
val queries = new ArrayBuffer[Query];
val mentWords = ment.words;
// Try the whole query, then prefixes ending in the head
@@ -92,6 +100,7 @@ object Query {
if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) {
queriesThisSlice += new Query(Seq(wikiCase(firstWord)) ++ mentWords.slice(indices._1 + 1, indices._2), ment, indices, "WIKICASED", RemovePuncFromQuery);
}
+
// Stemming (but only on head alone)
if (PluralQueryExpand && (indices._2 - indices._1) == 1 && firstWord.last == 's') {
queriesThisSlice ++= queriesThisSlice.map(query => new Query(Seq(removePlural(query.words(0))), ment, indices, query.queryType + "-STEM", RemovePuncFromQuery));
@@ -107,6 +116,52 @@ object Query {
// }
queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]());
}
+
+ def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = {
+ val queries = new ArrayBuffer[Query]()
+ val mentWords = ment.words
+ val relHeadIdx = ment.contextTree.getSpanHeadACECustom(ment.startIdx, ment.endIdx) - ment.startIdx
+ def addQuery(start: Int, end: Int, featsi:List[String]): Unit = {
+ var feats = featsi // local mutable copy so query-level features can be appended
+ val thisSlice = new ArrayBuffer[Query]()
+ val wrds = mentWords.slice(start, end)
+ thisSlice += new Query(wrds, ment, (start, end), "STD", true, feats ++ List("RemovedPunc"))
+ thisSlice += new Query(wrds, ment, (start, end), "STD", false, feats ++ List("IncludePunc"))
+ val firstWord = wrds(0)
+ val lastWord = wrds(wrds.size - 1)
+ if((end - start)== 1)
+ feats ++= List("SingleItemQuery")
+ if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) {
+ thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", true, feats ++ List("RemovedPunc"));
+ thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", false, feats ++ List("IncludePunc"));
+ }
+ // Stemming (but only on head alone)
+ if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') {
+ thisSlice ++= thisSlice.map(qu =>
+ new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", true, feats ++ List("RemovedPunc")));
+ thisSlice ++= thisSlice.map(qu =>
+ new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", false, feats ++ List("IncludePunc")));
+
+ }
+ queries ++= thisSlice
+ }
+ addQuery(0, ment.endIdx - ment.startIdx, List("SimpleQuery", "FullTextQuery"))
+ // TODO: make this ignore items that simply add a blacklisted word
+ for(i <- 0 to relHeadIdx) {
+ addQuery(i, relHeadIdx + 1, List("SimpleQuery", "PreHeadQuery"))
+ }
+ for(i <- relHeadIdx+1 until mentWords.size) {
+ addQuery(relHeadIdx, i, List("SimpleQuery", "PostHeadQuery"))
+ }
+ // try filtering words
+ val filterWords = mentWords.filter(!isBlacklisted(_, 0))
+ if(filterWords.size != mentWords.size) {
+ // we lost something, make new query
+ queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", true , List("FilteredQuery", "RemovedPunc"))
+ queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", false, List("FilteredQuery", "IncludePunc"))
+ }
+ queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]())
+ }
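+ // e.g. for the mention "the presidents" (head "presidents") this generates,
+ // roughly: "the presidents" and "presidents" in STD/WIKICASED variants (each
+ // with and without punctuation removal), the stemmed head "president", and a
+ // FIT query with the blacklisted "the" dropped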
def extractDenotationSetWithNil(queries: Seq[Query], queryDisambigs: Seq[Counter[String]], maxDenotations: Int): Seq[String] = {
val choicesEachQuery = queryDisambigs.map(_.getSortedKeys().asScala);
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
index e3b4d32..712ad3d 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala
@@ -18,6 +18,8 @@ import edu.berkeley.nlp.entity.ConllDocReader
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
+import scala.collection.mutable
+
case class QueryChoiceExample(val queries: Seq[Query],
val denotations: Seq[String],
val correctQueryIndices: Array[Int]) {
@@ -123,6 +125,176 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface,
val longQuery = tagsWithin.size > 3;
feat("DescriptorQueryTags=" + queryDescriptor + "-" + contextTag + (if (longQuery) "...") + tagsWithin.slice(Math.max(0, tagsWithin.size - 3), tagsWithin.size).toString);
feat("DescriptorHead=" + queryDescriptor + "-" + binSize(querySize) + "-" + ment.headStringLc);
+ for(f <- query.features)
+ feat(f)
+ feats.toArray;
+ });
+ }
+
+ def getDentationLinksSets(denotations: Seq[String], wikiDB: WikipediaInterface) : (Seq[Set[Int]], Seq[Set[Int]]) = {
+ (denotations.map(wikiDB.linksDB.getInLinksSetUseCache(_)), denotations.map(wikiDB.linksDB.getOutLinksSetUseCache(_)))
+ }
+
+ val logsv = (0 until 3000).map(Math.log(_))
+
+ def logs(i: Int) = {
+ if(i < logsv.size)
+ logsv(i)
+ else
+ Math.log(i)
+ }
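+ // memoized natural log: exact table lookup for 0 <= i < 3000 (logsv(0) is
+ // -Infinity, matching Math.log(0)), falling back to Math.log above that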
+
+ def unionSize[T](ss: Set[T]*) = {
+ val ns = new mutable.HashSet[T]()
+ for(s <- ss) {
+ ns ++= s
+ }
+ ns.size
+ }
+
+ def intersectSize[T](a: Set[T], b: Set[T]) = {
+ var smaller: Set[T] = a
+ var larger: Set[T] = b
+ if(a.size > b.size) {
+ larger = a
+ smaller = b
+ }
+ var ret = 0
+ for(i <- smaller) {
+ if(larger.contains(i))
+ ret += 1
+ }
+ ret
+ }
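+ // e.g. intersectSize(Set(1, 2, 3), Set(2, 3, 4)) == 2; iterating the smaller
+ // set keeps the cost at O(min(|a|, |b|)) lookups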
+
+ /*def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = {
+ (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) /
+ (logs(wsize) - logs(math.min(a.size,b.size)))
+ }
+
+ def PMI[T](a: Set[T], b: Set[T], wsize: Int) : Double = {
+ // TODO: the use of wsize here does not make sense;
+ // must be misunderstanding something
+ (intersectSize(a,b) * wsize).asInstanceOf[Float] / (a.size * b.size)
+ }
+
+ def GLOWfeatures[T](fn: (Set[T], Set[T], Int) => Double, refs: Seq[Set[T]], prefix: String): Seq[Array[String]] = {
+ val rsize = refs.size
+ val wsize = unionSize(refs:_*)
+ var max = Double.NegativeInfinity
+ var avg = 0.0
+ // TODO: rank the items in the list
+ //val valList = new mutable.MutableList[Double]()
+ val cache = new mutable.HashMap[Int,Double] {
+ override def initialSize: Int = rsize*rsize
+ }
+ for(a <- 0 until rsize; b <- 0 until rsize) {
+ if(a != b) {
+ val v = fn(refs(a), refs(b), wsize)
+ cache.put(a + b*65536, v)
+ if(v > max)
+ max = v
+ //valList += v
+ avg += v
+ }
+ }
+ avg /= (rsize * (rsize - 1))
+ for(a <- 0 until rsize) yield {
+ var isInMax = false
+ var isAboveAvg = false
+ var isAboveAvg2 = false
+ for(b <- 0 until rsize) {
+ if(a != b) {
+ //val v = fn(refs(a),refs(b),wsize)
+ val v : Double = cache.getOrElse(a + b*65536, 0.0)
+ if(v == max) {
+ isInMax = true
+ }
+ if(v > avg) {
+ isAboveAvg = true
+ }
+ if(v > (avg * 2)) {
+ isAboveAvg2 = true
+ }
+ }
+ }
+ val r = new ArrayBuffer[String]
+ if(isInMax)
+ r += prefix + "IsInMax"
+ if(isAboveAvg)
+ r += prefix + "isAboveAvg"
+ if(isAboveAvg2)
+ r += prefix + "isAboveAvg2"
+ r.toArray
+ }
+ }*/
+
+ def featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = {
+ val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query));
+ val queryNonemptyList = queryOutcomes.map(_.isEmpty);
+ val ment = queries.head.originalMent;
+ val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1;
+ /*val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB)
+
+ val PMINGDvals = Seq(
+ GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"),
+ GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"),
+ GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"),
+ GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-")
+ )*/
+
+ // TODO: this is not correct yet: we need to know what the rest of the
+ // document will be annotated with, but these are the denotations for a
+ // single example, which is not enough; we need every candidate annotation
+ // for the whole document.
+ //
+ // in the wikification paper the references are chosen jointly, so we would
+ // need to look at pairs of references
+
+ val denotationSim = denotations.map(t => wikiDB.textDB.compareDocumentC(ment.rawDoc.documentVectorCache, t))
+ val denotationSimMax = denotationSim.max
+ val denotationSimAvg = denotationSim.sum / denotationSim.size
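+ // denotationSim(i) is a word-overlap score between the mention's document
+ // vector and candidate page i; the max and average give reference points for
+ // the binned Compariable* features below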
+
+ // TODO: implement the local vector features that compare the text of the pages;
+ // the context could be the set of pages linking into/out of a page, though
+ // that isn't quite a similarity measure
+
+
+
+ Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => {
+ val feats = new ArrayBuffer[Int];
+ def feat(str: String) = addFeat(str, feats, addToIndexer);
+ /*for(p <- PMINGDvals)
+ for(f <- p(denIdx))
+ feat(f)
+ */
+ val query = queries(queryIdx);
+ val den = denotations(denIdx);
+ if (den == NilToken) {
+ feat("NilAndQueryNonempty=" + queryNonemptyList(queryIdx));
+ } else if (queryOutcomes(queryIdx).containsKey(den)) {
+ val queryDescriptorWithProper = (if (ment.pos(ment.headIdx - ment.startIdx) == "NNP") "PROP" else "NOM") + "-" + query.queryType;
+ val queryRank = queryOutcomes(queryIdx).getSortedKeys().indexOf(den);
+ feat("Rank=" + queryDescriptorWithProper + "-" + (queryRank + 1))
+ val queryStr = query.getFinalQueryStr;
+ val matchesQuery = den.toLowerCase == queryStr.toLowerCase;
+ feat("MatchesQuery=" + queryDescriptorWithProper + "-" + matchesQuery)
+ if (!matchesQuery) {
+ feat("ContainsQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.contains(queryStr.toLowerCase)));
+ feat("StartsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.startsWith(queryStr.toLowerCase)));
+ feat("EndsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.endsWith(queryStr.toLowerCase)));
+ }
+ val denotationHasParenthetical = den.contains("(") && den.endsWith(")");
+ feat("ContainsParenthetical=" + queryDescriptorWithProper + "-" + denotationHasParenthetical);
+ if (denotationHasParenthetical) {
+ feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase))
+ }
+ feat("CompariableWordsLog="+Math.floor(Math.log(denotationSim(denIdx))))
+ feat("CompariableIsMaxWordSim=" + (denotationSim(denIdx) == denotationSimMax))
+ feat("CompariableWordsAboveAvg=" + (denotationSim(denIdx) > denotationSimAvg))
+ feat("CompariableWordsReweight="+Math.floor(denotationSim(denIdx) / denotationSimMax * 10))
+ } else {
+ feat("Impossible");
+ }
feats.toArray;
});
}
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
index cdb1566..c4ad61f 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala
@@ -1,13 +1,14 @@
package edu.berkeley.nlp.entity.wiki
-import edu.berkeley.nlp.entity.Chunk
+import edu.berkeley.nlp.entity.{Document, Chunk, GUtil}
import edu.berkeley.nlp.futile.util.Logger
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.futile.util.Counter
import scala.collection.JavaConverters._
import edu.berkeley.nlp.entity.joint.JointDocACE
import java.io.PrintWriter
+import scala.collection.mutable.ArrayBuffer
+
object WikificationEvaluator {
def removeExcludes(chunks: Seq[Chunk[String]]) = chunks.filter(chunk => chunk.label != ExcludeToken)
@@ -78,6 +79,48 @@ object WikificationEvaluator {
}
Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom));
}
+
+
+ // build the set of all gold titles and the set of all predicted titles for
+ // each document, then compute an F1 over those bags of titles
+ def evaluateBOTF1_mfl(results : Map[Document, Seq[(Seq[String], Seq[String], Document)]]) = {
+ // f1 = 2 * precision * recall / (percison + recall)
+ var correct = 0
+ var precDenom = 0
+ var recDenom = 0
+ for((doc, matches) <- results) {
+ var seenBefore = Set[String]()
+ val allGold = Set(matches.flatMap(_._1):_*)
+ val allChoosen = Set(matches.map(_._2(0)):_*) //Set(matches.flatMap(_._2):_*)
+
+ /*for((gold, selected, _) <- matches) {
+ val goldS = Set(gold:_*)
+ val selectedS = Set(selected(0)) //Set(selected:_*)
+ val ints = goldS & selectedS
+ //if(!ints.subsetOf(seenBefore)) {
+ correct += ints.size
+ seenBefore ++= ints
+ //}
+ }*/
+ // TODO: something wrong with computing the set intersection
+
+ val dprecDenom = allChoosen.size
+ val drecDenom = allGold.size
+ var dcorrect = 0
+ allChoosen.foreach(c => {
+ if(isCorrect(allGold.toSeq, c))
+ dcorrect += 1
+ })
+ //val diff = (allGold ++ allChoosen) -- (allGold & allChoosen)
+ //val dcorrect = (allGold & allChoosen).size
+ //Logger.logss("Document f1: "+GUtil.renderPRF1(dcorrect, dprecDenom, drecDenom))
+ precDenom += dprecDenom
+ recDenom += drecDenom
+ correct += dcorrect
+ }
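+ // e.g. with gold {A, B} and chosen {A, C} for a single document:
+ // dcorrect = 1, dprecDenom = 2, drecDenom = 2, so P = R = F1 = 0.5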
+ Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom))
+ }
+
def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = {
val bagOfTitles = titles.flatMap(sentTitles => {
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
index 88f9ff2..d87fbfe 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala
@@ -5,9 +5,8 @@ import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser
-import edu.berkeley.nlp.entity.ConllDocReader
+import edu.berkeley.nlp.entity.{WikiDocReader, ConllDocReader, GUtil}
import edu.berkeley.nlp.entity.coref.CorefDocAssembler
-import edu.berkeley.nlp.entity.GUtil
import edu.berkeley.nlp.entity.coref.Mention
import edu.berkeley.nlp.entity.coref.MentionPropertyComputer
import edu.berkeley.nlp.entity.lang.Language
@@ -27,7 +26,7 @@ import edu.berkeley.nlp.entity.wiki._
* java -cp /path/to/jar -Xmx8g edu.berkeley.nlp.entity.wiki.WikipediaInterface \
* -datasetPaths path/to/test-docs-directory-one-doc-per-file,path/to/additional/docs,... \
* -wikipediaDumpPath path/to/enwiki-latest-pages-articles.xml
- * -outputDir path/to/output-file.ser.gz
+ * -outputPath path/to/output-file.ser.gz
*
* Required arguments:
* -datasetPaths: pointer to CoNLL-formatted files whose mentions we should extract
@@ -57,7 +56,8 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
val redirectsDB: WikipediaRedirectsDB,
val categoryDB: WikipediaCategoryDB,
val linksDB: WikipediaLinkDB,
- val auxDB: WikipediaAuxDB) extends Serializable {
+ val auxDB: WikipediaAuxDB,
+ val textDB: WikipediaTextDB) extends Serializable {
def getStandardPriorForJointModel(ment: Mention) = {
val counter = new Counter[String];
@@ -75,7 +75,9 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
def disambiguate(ment: Mention) = disambiguateBest(ment, ment.headIdx)
def disambiguateBest(ment: Mention, specifiedHeadIdx: Int) = {
- redirectsDB.followRedirect(titleGivenSurfaceDB.disambiguateQueries(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)));
+ redirectsDB.followRedirect(
+ titleGivenSurfaceDB.disambiguateQueries(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr)));
}
def disambiguateBestNoDisambig(query: Query) = {
@@ -95,19 +97,51 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB,
}
def disambiguateBestGetAllOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def disambiguateBestGetAllOptions(query: Query) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(
+ Seq(query.getFinalQueryStr))));
}
-
+
+ def merge[T](a: Counter[T], b: Counter[T]) = {
+ for(k <- a.keySet().asScala) {
+ b.incrementCount(k, a.getCount(k))
+ }
+ }
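+ // e.g. merging a counter {x: 1, y: 2} into a target {y: 3} leaves the target
+ // with {x: 1, y: 5}; note that the second argument is mutated in place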
+
+ def disambigRes(query: Query) = {
+ val str = query.getFinalQueryStr
+ var titles = titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(str))
+ titles.incrementCount(str, 1.0)
+ var redirs = redirectsDB.followRedirectsCounter(titles)
+ merge(titles, redirs)
+ //var aux = auxDB.purgeDisambiguationAll(redirs)
+ //merge(redirs, aux)
+ //aux
+ redirs
+ }
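+ // unlike disambiguateBestGetAllOptions, this keeps the raw surface titles,
+ // the query string itself, and the redirect targets all as candidates, and
+ // skips the auxDB disambiguation-page purge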
+
+
+
def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def disambiguateBestGetAllOneBestOptions(ment: Mention, specifiedHeadIdx: Int) = {
- auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
+ auxDB.purgeDisambiguationAll(
+ redirectsDB.followRedirectsCounter(
+ titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(
+ Query.extractQueriesBest(ment).map(_.getFinalQueryStr))));
}
def getCategories(title: String) = categoryDB.getCategories(title);
@@ -160,6 +194,8 @@ object WikipediaInterface {
val categoryDBInputPath = "";
val categoryDBOutputPath = "";
+
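+  // optional path to gold Wikipedia standoff annotations; when set, the gold
+  // link targets are added to the query set below so that titles pointing at
+  // redirect pages can be normalized at train/test time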
+ val wikiStandoff = "";
def processWikipedia(wikipediaPath: String, queries: Set[String], parser: CoarseToFineMaxRuleParser, backoffParser: CoarseToFineMaxRuleParser): WikipediaInterface = {
val titleGivenSurface = WikipediaTitleGivenSurfaceDB.processWikipedia(wikipediaPath, queries);
@@ -168,11 +204,12 @@ object WikipediaInterface {
val links = if (WikipediaInterface.computeLinkDB) {
WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc);
} else {
- new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]);
+ new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]);
}
val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser);
val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc);
- val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux);
+ val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc);
+ val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux, texts);
wi.printSome();
wi;
}
@@ -184,10 +221,11 @@ object WikipediaInterface {
val links = if (WikipediaInterface.computeLinkDB) {
WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc);
} else {
- new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]);
+ new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]);
}
val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc);
- val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux);
+ val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc);
+ val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux, texts);
wi.printSome();
wi;
}
@@ -211,22 +249,59 @@ object WikipediaInterface {
val mentionPropertyComputer = new MentionPropertyComputer(None);
val pmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = false);
val gmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = true);
- val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path => {
- if (WikipediaInterface.mentionType == "old") {
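+    // a dataset path may carry a mention-type prefix, e.g. "ace:path/to/data";
+    // otherwise the global WikipediaInterface.mentionType flag applies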
+    val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(rawPath => {
+      var path = rawPath
+      val mentionType = if (path.contains(":")) {
+        val s = path.split(":")
+        path = s(1)
+        s(0)
+      } else {
+        WikipediaInterface.mentionType
+      }
+      Logger.logss("Loading documents " + mentionType + " " + path)
+ if (mentionType == "old") {
// Wikification dataset: use only auto_conll and pred mentions
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer));
- } else if (WikipediaInterface.mentionType == "ace") {
+ } else if (mentionType == "ace") {
// ACE: Use gold mentions here
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => gmAssembler.createCorefDoc(doc, mentionPropertyComputer));
- } else if (WikipediaInterface.mentionType == "ontonotes") {
+ } else if (mentionType == "ontonotes") {
// OntoNotes: use only auto_conll and pred mentions
ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer));
+ } else if (mentionType == "wikiser") {
+ WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => {
+ try {
+ gmAssembler.createCorefDoc(doc, mentionPropertyComputer)
+ } catch {
+            case e: Exception => {
+              // about 30 documents currently have broken references; log the
+              // failure and emit null so the document is filtered out below
+              println("FAIL DOCUMENT: " + doc.docID)
+ null
+ }
+ }
+ })
} else {
-        throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType);
+        throw new RuntimeException("Unrecognized mention type: " + mentionType);
}
- });
+  }).filter(_ != null);
// val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet;
- val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet;
+
+  // MFL TODO: these are the queries that will have to be rewritten to support the wiki documents.
+ var queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/)
+ .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr))
+ .toSet;
+  // some gold titles in the older dataset link to pages that are now redirects,
+  // so load them here so the redirects can be normalized during training/testing
+  val golds: Set[String] = if (wikiStandoff.nonEmpty) {
+    WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiStandoff).flatMap(d => {
+      d._2.flatMap(v => {
+        v._2.flatMap(_.label).map(_.replace("_", " "))
+      })
+    }).toSet
+  } else {
+    Set[String]()
+  }
+ queries = queries ++ golds
Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents");
val interface = if (WikipediaInterface.categoryDBInputPath != "") {
val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB];
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
index cdcb894..d2b00cb 100644
--- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
+++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala
@@ -1,6 +1,7 @@
package edu.berkeley.nlp.entity.wiki
import edu.berkeley.nlp.futile.fig.basic.Indexer
+import scala.collection.mutable
import scala.collection.mutable.HashMap
import scala.collection.mutable.ArrayBuffer
import edu.berkeley.nlp.futile.fig.basic.IOUtils
@@ -14,33 +15,73 @@ import edu.berkeley.nlp.entity.lang.Language
import edu.berkeley.nlp.entity.wiki._
@SerialVersionUID(9084163557546777842L)
-class WikipediaLinkDB(val pageNameIndex: Indexer[String],
- val inLinksMap: HashMap[String,Array[Int]],
- val outLinksMap: HashMap[String,Array[Int]]) extends Serializable {
- var outLinksSetCache: HashMap[String,Set[Int]] = null;
-
+class WikipediaLinkDB(private val pageNameIndex: Indexer[String],
+ private val inLinksMap: HashMap[Int,Array[Int]],
+ private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable {
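+  // the link maps are now keyed by each page's index in pageNameIndex rather
+  // than by the raw title string; the set caches below are @transient, so they
+  // are dropped on serialization and rebuilt lazily (null-checked) on first use
+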
+ @transient
+ private var outLinksSetCache : mutable.HashMap[String,Set[Int]] = null
+
+ @transient
+ private var inLinksSetCache : mutable.HashMap[String,Set[Int]] = null
+
def getOutLinks(title: String) = {
- if (outLinksMap.contains(title)) {
- outLinksMap(title);
+ val k = pageNameIndex.indexOf(title)
+ if (outLinksMap.contains(k)) {
+ outLinksMap(k);
} else {
Array[Int]();
}
}
-
- def getOutLinksSetUseCache(title: String) = {
- if (outLinksMap.contains(title)) {
- if (outLinksSetCache == null) {
- outLinksSetCache = new HashMap[String,Set[Int]];
+
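+  // mirror of getOutLinks for the incoming-link direction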
+ def getInLinks(title: String) = {
+ val k = pageNameIndex.indexOf(title)
+ if(inLinksMap.contains(k)) {
+ inLinksMap(k)
+ } else {
+ Array[Int]()
+ }
+ }
+
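+  // set view of the in-links with a small memo cache; rather than evicting
+  // single entries, the whole cache is reset once it grows past 1000 entries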
+ def getInLinksSetUseCache(title: String) : Set[Int] = {
+ if(inLinksSetCache == null) {
+ inLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ if(inLinksSetCache.contains(title)) {
+ inLinksSetCache(title)
+ } else {
+ val k = pageNameIndex.indexOf(title)
+ if(k != -1) {
+ if (inLinksSetCache.size > 1000) {
+ inLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ val s = inLinksMap.getOrElse(k, Array[Int]()).toSet
+ inLinksSetCache.put(title, s)
+ s
+ } else {
+ Set[Int]()
}
- if (!outLinksSetCache.contains(title)) {
+ }
+ }
+
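+  // same caching scheme as getInLinksSetUseCache, applied to the out-links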
+ def getOutLinksSetUseCache(title: String) : Set[Int] = {
+ if(outLinksSetCache == null) {
+ outLinksSetCache = new mutable.HashMap[String,Set[Int]]()
+ }
+ if(outLinksSetCache.contains(title)) {
+ outLinksSetCache(title)
+ } else {
+ val k = pageNameIndex.indexOf(title)
+ if(k != -1) {
if (outLinksSetCache.size > 1000) {
- outLinksSetCache.dropRight(1);
+            // dropRight(1) never mutated the map, and evicting a single entry
+            // was too slow; just reset the whole cache
+ outLinksSetCache = new mutable.HashMap[String,Set[Int]]()
}
- outLinksSetCache.put(title, outLinksMap(title).toSet);
+ val s = outLinksMap.getOrElse(k, Array[Int]()).toSet
+ outLinksSetCache.put(title, s)
+ s
+ } else {
+ Set[Int]()
}
- outLinksSetCache(title);
- } else {
- Set[Int]();
}
}
@@ -56,9 +97,11 @@ class WikipediaLinkDB(val pageNameIndex: Indexer[String],
}
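+  // the indexOf lookups are hoisted below so each title is only indexed once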
def doesOneLinkToOther(title1: String, title2: String): Boolean = {
+ val ti1 = pageNameIndex.indexOf(title1)
+ val ti2 = pageNameIndex.indexOf(title2)
val outLinksTitle1 = getOutLinks(title1);
val outLinksTitle2 = getOutLinks(title2);
- outLinksTitle1.contains(pageNameIndex.indexOf(title2)) || outLinksTitle2.contains(pageNameIndex.indexOf(title1))
+ outLinksTitle1.contains(ti2) || outLinksTitle2.contains(ti1)
}
}
@@ -66,18 +109,19 @@ object WikipediaLinkDB {
def processWikipedia(wikipediaPath: String, pageTitleSetLc: Set[String]): WikipediaLinkDB = {
val pageNamesIndex = new Indexer[String];
- val inLinksMap = new HashMap[String,HashSet[Int]];
- val outLinksMap = new HashMap[String,HashSet[Int]];
+ val inLinksMap = new HashMap[Int,HashSet[Int]];
+ val outLinksMap = new HashMap[Int,HashSet[Int]];
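+    // both maps are built keyed by page index (via pageNamesIndex) to match the
+    // new WikipediaLinkDB constructor above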
val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath));
var currentPageTitle = "";
- var linksThisPage = new StringBuilder();
+ var currentPageTitleind = 0
+ //var linksThisPage = new StringBuilder();
var doneWithThisPage = false;
var numPagesSeen = 0;
var lineIdx = 0;
- var isInText = false;
- val categoryMap = new HashMap[String,ArrayBuffer[String]];
- val infoboxMap = new HashMap[String,String];
- val appositiveMap = new HashMap[String,String];
+ //var isInText = false;
+ //val categoryMap = new HashMap[String,ArrayBuffer[String]];
+ //val infoboxMap = new HashMap[String,String];
+ //val appositiveMap = new HashMap[String,String];
// Extract first line that's not in brackets
while (lines.hasNext) {
val line = lines.next;
@@ -96,10 +140,16 @@ object WikipediaLinkDB {
} else if (line.contains("