From ad98fef199b8939c04a0ce197ced5adbeb9e01b5 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 21 Feb 2015 01:21:11 -0800 Subject: [PATCH 01/25] start to wiki interface, looks like sentence splitting is working --- .gitignore | 8 ++ Makefile | 14 ++++ .../edu/berkeley/nlp/entity/ConllDoc.scala | 16 ++-- .../berkeley/nlp/entity/ConllDocReader.scala | 6 +- .../berkeley/nlp/entity/ConllDocWriter.scala | 10 +-- .../edu/berkeley/nlp/entity/Document.scala | 34 ++++++++ .../berkeley/nlp/entity/EntitySystem.scala | 2 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 34 ++++++++ .../berkeley/nlp/entity/WikiDocReader.scala | 77 +++++++++++++++++++ .../nlp/entity/coref/CorefConllScorer.scala | 8 +- .../berkeley/nlp/entity/coref/CorefDoc.scala | 4 +- .../nlp/entity/coref/CorefDocAssembler.scala | 34 ++++---- .../entity/coref/CorefDocAssemblerACE.scala | 4 +- .../nlp/entity/coref/CorefSystem.scala | 2 +- .../berkeley/nlp/entity/coref/Mention.scala | 6 +- .../berkeley/nlp/entity/joint/JointDoc.scala | 8 +- .../nlp/entity/joint/JointDocACE.scala | 6 +- .../nlp/entity/joint/JointPredictor.scala | 2 +- .../nlp/entity/joint/JointPredictorACE.scala | 2 +- .../berkeley/nlp/entity/ner/NEEvaluator.scala | 6 +- .../nlp/entity/ner/NESentenceMunger.scala | 4 +- .../berkeley/nlp/entity/ner/NerPruner.scala | 10 +-- .../nlp/entity/ner/NerSystemLabeled.scala | 15 ++-- .../nlp/entity/preprocess/Reprocessor.scala | 2 +- .../entity/preprocess/SentenceSplitter.scala | 8 +- .../nlp/entity/wiki/WikipediaInterface.scala | 7 +- 26 files changed, 248 insertions(+), 81 deletions(-) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 src/main/java/edu/berkeley/nlp/entity/Document.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7ce67f --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ 
+berkeley-entity-models.tgz +data.tgz +data/ +expers/ +models/ +project/project/ +project/target/ +target/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..eab9e29 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +# some random useful functions + +TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar + +all: $(TARGET) + +$(TARGET): $(wildcard src/**) + sbt assembly + +aceTester: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.ACETester -dataPath data/ace05/ace05-all-conll + +queryModel: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala index d29aaa0..b4012e9 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala @@ -17,13 +17,13 @@ case class ConllDoc(val docID: String, val trees: Seq[DepConstTree], val nerChunks: Seq[Seq[Chunk[String]]], val corefChunks: Seq[Seq[Chunk[Int]]], - val speakers: Seq[Seq[String]]) { + val speakers: Seq[Seq[String]]) extends Document { - val numSents = words.size; + override val numSents = words.size; - def uid = docID -> docPartNo; + override def uid = docID -> docPartNo; - def fileName = { + override def fileName = { if (docID.contains("/")) { docID.substring(docID.lastIndexOf("/") + 1); } else { @@ -31,11 +31,11 @@ case class ConllDoc(val docID: String, } } - def printableDocName = docID + " (part " + docPartNo + ")"; + override def printableDocName = docID + " (part " + docPartNo + ")"; - def isConversation = docID.startsWith("bc") || docID.startsWith("wb"); - - def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx); + override def isConversation = docID.startsWith("bc") || docID.startsWith("wb") + + override def 
getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx); } object ConllDoc { diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala index 91685f3..9847abd 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala @@ -38,7 +38,7 @@ class ConllDocReader(val lang: Language, case _ => throw new RuntimeException("Bad language, no head finder for " + lang); } - def readConllDocs(fileName: String): Seq[ConllDoc] = { + def readConllDocs(fileName: String): Seq[Document] = { val fcn = (docID: String, docPartNo: Int, docBySentencesByLines: ArrayBuffer[ArrayBuffer[String]]) => assembleConllDoc(docBySentencesByLines, docID, docPartNo); ConllDocReader.readConllDocsGeneral(fileName, fcn); } @@ -283,7 +283,7 @@ object ConllDocReader { // loadRawConllDocsWithSuffix(path, size, if (gold) "gold_conll" else "auto_conll", lang, betterParsesFile); // } - def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[ConllDoc] = { + def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[Document] = { Logger.logss("Loading " + size + " docs from " + path + " ending with " + suffix); val rawDir = new File(path); if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { @@ -292,7 +292,7 @@ object ConllDocReader { val rawFiles = rawDir.listFiles.sortBy(_.getAbsolutePath()); val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix)); val reader = new ConllDocReader(lang, betterParsesFile); - val docs = new ArrayBuffer[ConllDoc]; + val docs = new ArrayBuffer[Document]; var docCounter = 0; var fileIdx = 0; while (fileIdx < files.size 
&& (size == -1 || docCounter < size)) { diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala index 395a268..422a694 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala @@ -16,7 +16,7 @@ import edu.berkeley.nlp.entity.wiki.WikiAnnotReaderWriter object ConllDocWriter { - def writeDoc(writer: PrintWriter, conllDoc: ConllDoc, clustering: OrderedClusteringBound) { + def writeDoc(writer: PrintWriter, conllDoc: Document, clustering: OrderedClusteringBound) { writeIncompleteConllDoc(writer, conllDoc.docID, conllDoc.docPartNo, conllDoc.words, conllDoc.pos, conllDoc.trees.map(_.constTree), conllDoc.speakers, conllDoc.nerChunks, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); // val corefBits = getCorefBits(conllDoc.words.map(_.size), convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); // val numZeroesToAddToPartNo = 3 - conllDoc.docPartNo.toString.size; @@ -35,7 +35,7 @@ object ConllDocWriter { } def writeDocWithPredAnnotations(writer: PrintWriter, - conllDoc: ConllDoc, + conllDoc: Document, nerChunks: Seq[Seq[Chunk[String]]], corefClustering: OrderedClusteringBound, wikiChunks: Option[Seq[Seq[Chunk[String]]]] = None) { @@ -45,7 +45,7 @@ object ConllDocWriter { def writeDocWithPredAnnotationsWikiStandoff(writer: PrintWriter, standoffWriter: PrintWriter, - conllDoc: ConllDoc, + conllDoc: Document, nerChunks: Seq[Seq[Chunk[String]]], corefClustering: OrderedClusteringBound, wikiChunks: Seq[Seq[Chunk[String]]]) { @@ -54,7 +54,7 @@ object ConllDocWriter { } def writeIncompleteConllDoc(writer: PrintWriter, - doc: ConllDoc) { + doc: Document) { writeIncompleteConllDocNestedNER(writer, doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees.map(_.constTree), doc.speakers, doc.nerChunks, doc.corefChunks); } @@ -210,7 +210,7 @@ object ConllDocWriter { } } - def 
writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: ConllDoc) { + def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: Document) { writer.println("O\t0\t0\tO\t-X-\t-DOCSTART-\tx\tx\t0"); // B-LOC 0 0 I-NP NNP Portugal x x 0 diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala new file mode 100644 index 0000000..8a2ef9d --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -0,0 +1,34 @@ +package edu.berkeley.nlp.entity + +/** + * Created by matthew on 2/18/15. + */ +trait Document { + def docID : String + def docPartNo : Int + // arrays of words in each sentence including punc + def words : Seq[Seq[String]] + // the gram types of the words + def pos : Seq[Seq[String]] + // parse trees of each sentence + def trees : Seq[DepConstTree] + // I am guessing the type of the chunk eg: ORG-NAM + def nerChunks : Seq[Seq[Chunk[String]]] + // have ranges and identifiers for the unique item that they are referenceing + // appears [start, end) + def corefChunks : Seq[Seq[Chunk[Int]]] + // just use "-" for each in the case that the speaker is unknown + def speakers : Seq[Seq[String]] + + def numSents : Int = -1 + + def uid : (String, Int) = docID -> docPartNo + + def fileName : String + + def printableDocName : String + + def isConversation : Boolean = false + + def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]] +} diff --git a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala index 1fad8ce..2bf9fa3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala +++ b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala @@ -153,7 +153,7 @@ object EntitySystem { ConllDocReader.loadRawConllDocsWithSuffix(goldPath, size, goldSuffix)); } else { (ConllDocReader.loadRawConllDocsWithSuffix(path, size, suffix), - new ArrayBuffer[ConllDoc]()); + new ArrayBuffer[Document]()); 
} val goldWikification = new HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]; val assembler = CorefDocAssembler(Driver.lang, Driver.useGoldMentions); diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala new file mode 100644 index 0000000..bcec448 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -0,0 +1,34 @@ +package edu.berkeley.nlp.entity + +/** + * Created by matthew on 2/18/15. + */ +case class WikiDoc (docID : String, + docPartNo : Int, + words : Seq[Seq[String]], + pos : Seq[Seq[String]], + trees: Seq[DepConstTree], + nerChunks : Seq[Seq[Chunk[String]]], + corefChunks : Seq[Seq[Chunk[Int]]], + speakers : Seq[Seq[String]] ) extends Document { + + override val numSents = words.size; + + override def uid = docID -> docPartNo; + + override def fileName = { + if (docID.contains("/")) { + docID.substring(docID.lastIndexOf("/") + 1); + } else { + docID; + } + } + + override def printableDocName = docID + " (part " + docPartNo + ")"; + + override def isConversation = docID.startsWith("bc") || docID.startsWith("wb") + + override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None; + + +} diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala new file mode 100644 index 0000000..0896864 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -0,0 +1,77 @@ +package edu.berkeley.nlp.entity + +import java.io.File + +import edu.berkeley.nlp.entity.lang.{ModCollinsHeadFinder, Language} +import edu.berkeley.nlp.entity.preprocess.SentenceSplitter +import edu.berkeley.nlp.futile.syntax.Tree + +import scala.collection.immutable.HashMap +import scala.collection.mutable.ArrayBuffer +import scala.xml._ + +/** + * Created by matthew on 2/18/15. 
+ */ +class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { + + val betterParses = new HashMap[ArrayBuffer[String], Tree[String]] + + // TODO: betterParsesFile + + val headFinder = lang match { + case Language.ENGLISH => new ModCollinsHeadFinder() + case _ => throw new RuntimeException() + } + + val sentenceSplitter = SentenceSplitter.loadSentenceSplitter("models/sentsplit.txt.gz") + + def readWikiDocs(fileName : String) : Seq[WikiDoc] = { + val referencesFile = fileName.replace("RawTexts", "Problems"); + val refxml = XML.loadFile(referencesFile); + val document = scala.io.Source.fromFile(fileName).mkString + + //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) + val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) + + + + for(reference <- refxml \ "ReferenceInstance") { + val surfaceForm = (reference \ "SurfaceForm")(0).text.trim + val offset = (reference \ "offset")(0).text.trim.toInt + val length = (reference \ "length")(0).text.trim.toInt + val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim + val annotatorId = (reference \ "AnnotatorId")(0).text.trim + val annotation = (reference \ "Annotation")(0).text.trim + + + } + + // docID some unique identifier, filename + // partNo some int cnt + // words an array of sentences + // trees set of parse trees for a given sentence entity.DepConstTree + // nerchunks entity.Chunk + + + Seq[WikiDoc]() + } + +} + +object WikiDocReader { + def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = { + val rawDir = new File(path) + if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { + throw new RuntimeException("Couldn't find directory " + path); + } + var rawFiles = rawDir.listFiles.map(_.getAbsolutePath()) + //val files = rawFiles.filter(file => 
file.getAbsolutePath.endsWith(suffix)); + val reader = new WikiDocReader(lang, betterParsesFile) + val docs = new ArrayBuffer[Document] + for(fname <- rawFiles) { + docs ++= reader.readWikiDocs(fname) + } + docs + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala index bfd8b14..ee9b457 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala @@ -9,22 +9,22 @@ import scala.sys.process.stringSeqToProcess import scala.sys.process.Process import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.Driver -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ConllDocWriter class CorefConllScorer(val conllEvalScriptPath: String) { - def renderFinalScore(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = { + def renderFinalScore(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = { val summary = score(conllDocs, rawPredClusterings, goldClusterings, true); CorefConllScorer.processConllString(summary, false); } - def renderSuffStats(conllDoc: ConllDoc, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = { + def renderSuffStats(conllDoc: Document, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = { val summary = score(Seq(conllDoc), Seq(rawPredClustering), Seq(goldClustering), false); CorefConllScorer.processConllString(summary, true); } - def score(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = { + def score(conllDocs: Seq[Document], rawPredClusterings: 
Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = { val predClusterings = if (Driver.doConllPostprocessing) rawPredClusterings.map(_.postprocessForConll()) else rawPredClusterings; // var predFile = File.createTempFile("temp", ".conll"); val (predFile, goldFile) = if (Driver.conllOutputDir != "" && saveTempFiles) { diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala index f7cc4b6..f5634fb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala @@ -10,9 +10,9 @@ import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document -case class CorefDoc(val rawDoc: ConllDoc, +case class CorefDoc(val rawDoc: Document, val goldMentions: Seq[Mention], val goldClustering: OrderedClustering, val predMentions: Seq[Mention]) { diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala index 9c369e3..413e1cd 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala @@ -12,17 +12,17 @@ import edu.berkeley.nlp.entity.lang.ChineseCorefLanguagePack import edu.berkeley.nlp.entity.lang.ArabicCorefLanguagePack import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document case class ProtoMention(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIdx: Int); case class ProtoMentionFancy(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIndices: 
Seq[Int]); -case class ProtoCorefDoc(val doc: ConllDoc, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]); +case class ProtoCorefDoc(val doc: Document, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]); class CorefDocAssembler(val langPack: CorefLanguagePack, val useGoldMentions: Boolean) { - def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -31,7 +31,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def createCorefDocFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = { + def createCorefDocFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -41,11 +41,11 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { + def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack); } - def extractPredMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { + def extractPredMentions(rawDoc: Document, 
propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSorted(rawDoc, gms); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -54,7 +54,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - def extractPredMentionsFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = { + def extractPredMentionsFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSortedFancy(rawDoc, gms, possibleChunks); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -63,7 +63,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - private def getProtoMentionsSorted(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = { + private def getProtoMentionsSorted(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK @@ -131,7 +131,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, // } } - private def getProtoMentionsSortedFancy(rawDoc: ConllDoc, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = { + private def getProtoMentionsSortedFancy(rawDoc: Document, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NPs and PRPs *except* for those contained in NE chunks (the NE tagger seems more 
reliable than the parser) @@ -154,7 +154,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, } } - private def filterNonMaximalNPs(rawDoc: ConllDoc, mentionExtents: Seq[HashSet[ProtoMention]]) = { + private def filterNonMaximalNPs(rawDoc: Document, mentionExtents: Seq[HashSet[ProtoMention]]) = { val filteredProtoMentionsSorted = (0 until rawDoc.numSents).map(i => new ArrayBuffer[ProtoMention]); for (sentIdx <- 0 until mentionExtents.size) { val protoMentionsByHead = mentionExtents(sentIdx).groupBy(_.headIdx); @@ -211,7 +211,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, ////////////////// - def createCorefDocWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDocWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentionsWithCoordination(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -220,7 +220,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def extractGoldMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { + def extractGoldMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { val goldProtoMentionsSorted = getGoldProtoMentionsSortedWithCoordination(rawDoc); val finalMentions = new ArrayBuffer[Mention](); val goldClusterLabels = new ArrayBuffer[Int](); @@ -238,7 +238,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, (finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels)); } - def extractPredMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { + def 
extractPredMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSortedWithCoordination(rawDoc, gms); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -247,7 +247,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - private def getGoldProtoMentionsSortedWithCoordination(rawDoc: ConllDoc): Seq[Seq[ProtoMentionFancy]] = { + private def getGoldProtoMentionsSortedWithCoordination(rawDoc: Document): Seq[Seq[ProtoMentionFancy]] = { val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield { for (chunk <- rawDoc.corefChunks(sentIdx)) yield { val headIndices = rawDoc.trees(sentIdx).getSpanHeadOrNPCoordinatedHeads(chunk.start, chunk.end); @@ -257,7 +257,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, goldProtoMentions.map(_.sortBy(ment => (ment.sentIdx, ment.headIndices.head, ment.endIdx, ment.startIdx))); } - private def getProtoMentionsSortedWithCoordination(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = { + private def getProtoMentionsSortedWithCoordination(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMentionFancy]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK @@ -442,7 +442,7 @@ object CorefDocAssembler { new CorefDocAssembler(langPack, useGoldMentions); } - def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = { + def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = { val goldProtoMentionsSorted = getGoldProtoMentionsSorted(rawDoc); val finalMentions = new 
ArrayBuffer[Mention](); val goldClusterLabels = new ArrayBuffer[Int](); @@ -460,7 +460,7 @@ object CorefDocAssembler { (finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels)); } - def getGoldProtoMentionsSorted(rawDoc: ConllDoc): Seq[Seq[ProtoMention]] = { + def getGoldProtoMentionsSorted(rawDoc: Document): Seq[Seq[ProtoMention]] = { val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield { for (chunk <- rawDoc.corefChunks(sentIdx)) yield { val headIdx = rawDoc.trees(sentIdx).getSpanHead(chunk.start, chunk.end); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala index cacd259..41a80e3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala @@ -5,13 +5,13 @@ import edu.berkeley.nlp.futile.util.Logger import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.wiki.ACEMunger import java.io.File -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document class CorefDocAssemblerACE(dirPath: String) { val langPack = new EnglishCorefLanguagePack() - def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala index 208c342..85adc64 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala @@ -39,7 +39,7 @@ import 
edu.berkeley.nlp.entity.xdistrib.DocumentGraphComponents import edu.berkeley.nlp.futile.fig.exec.Execution import edu.berkeley.nlp.entity.Driver import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.WordNetInterfacer import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.entity.ConllDocReader diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala index 58b9cd4..8069292 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala @@ -8,14 +8,14 @@ import edu.berkeley.nlp.entity.sem.SemClasser import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.Driver; import edu.berkeley.nlp.entity.WordNetInterfacer // TODO: Extract an interface for ConllDoc so I don't have to keep the whole // document around...but while I'm feature engineering it's useful to be able // to put my hands on anything I want -class Mention(val rawDoc: ConllDoc, +class Mention(val rawDoc: Document, val mentIdx: Int, val sentIdx: Int, val startIdx: Int, @@ -247,7 +247,7 @@ object Mention { val StartPosPlaceholder = ""; val EndPosPlaceholder = ""; - def createMentionComputeProperties(rawDoc: ConllDoc, + def createMentionComputeProperties(rawDoc: Document, mentIdx: Int, sentIdx: Int, startIdx: Int, diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala index 512cc27..a78e96f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala @@ -15,10 +15,10 @@ import edu.berkeley.nlp.entity.Driver import 
edu.berkeley.nlp.entity.ner.NerFeaturizer import scala.collection.mutable.HashSet import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ner.NerPruner -class JointDoc(val rawDoc: ConllDoc, +class JointDoc(val rawDoc: Document, val docGraph: DocumentGraph, val goldNERChunks: Seq[Seq[Chunk[String]]], val goldWikiChunks: Seq[Seq[Chunk[String]]]) { @@ -71,7 +71,7 @@ class JointDoc(val rawDoc: ConllDoc, object JointDoc { - def apply(rawDoc: ConllDoc, + def apply(rawDoc: Document, docGraph: DocumentGraph, maybeGoldNERChunks: Option[Seq[Seq[Chunk[String]]]], maybeGoldWikiChunks: Option[Seq[Seq[Chunk[String]]]]) = { @@ -89,7 +89,7 @@ object JointDoc { } def assembleJointDocs(docGraphs: Seq[DocumentGraph], - goldConllDocsForNER: Seq[ConllDoc], + goldConllDocsForNER: Seq[Document], goldWikification: HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]) = { docGraphs.map(docGraph => { val rawDoc = docGraph.corefDoc.rawDoc; diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala index fc78b5e..85c9683 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala @@ -5,13 +5,13 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.DocumentGraph import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.entity.wiki._ import edu.berkeley.nlp.futile.util.Logger -class JointDocACE(val rawDoc: ConllDoc, +class JointDocACE(val rawDoc: Document, val docGraph: DocumentGraph, val goldWikiChunks: Seq[Seq[Chunk[Seq[String]]]]) { @@ -36,7 +36,7 @@ class JointDocACE(val rawDoc: ConllDoc, object JointDocACE { - def apply(rawDoc: 
ConllDoc, + def apply(rawDoc: Document, docGraph: DocumentGraph, maybeGoldWikiChunks: Option[Seq[Seq[Chunk[Seq[String]]]]]) = { val goldWikiChunks = if (maybeGoldWikiChunks.isDefined) { diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala index 667672b..afeb3f7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala @@ -3,7 +3,7 @@ package edu.berkeley.nlp.entity.joint import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.entity.GUtil diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala index 71e9274..cf93562 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala @@ -12,7 +12,7 @@ import edu.berkeley.nlp.entity.coref.CorefDocAssembler import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.DocumentGraph import edu.berkeley.nlp.futile.fig.exec.Execution import edu.berkeley.nlp.entity.coref.CorefEvaluator diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala index a0f4c96..0627b42 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala @@ -1,6 +1,6 @@ package edu.berkeley.nlp.entity.ner 
-import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.coref.CorefSystem @@ -53,11 +53,11 @@ object NEEvaluator { })); } - def evaluate(goldDocs: Seq[ConllDoc], predDocs: Seq[ConllDoc]) { + def evaluate(goldDocs: Seq[Document], predDocs: Seq[Document]) { evaluateChunks(goldDocs, predDocs.map(_.nerChunks)); } - def evaluateChunks(goldDocs: Seq[ConllDoc], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) { + def evaluateChunks(goldDocs: Seq[Document], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) { var correct = 0; val correctByLabel = new Counter[String]; var correctSameHead = 0; diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala index fd9cd40..911ba9c 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala @@ -2,13 +2,13 @@ package edu.berkeley.nlp.entity.ner import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.fig.basic.IOUtils object NESentenceMunger { - def writeSentences(file: String, docs: Seq[ConllDoc]) { + def writeSentences(file: String, docs: Seq[Document]) { val out = IOUtils.openOutHard(file); for (doc <- docs; words <- doc.words) { out.println(words.foldLeft("")(_ + " " + _).trim); diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala index 1b7a40f..e73e7c2 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala @@ -2,7 +2,7 @@ package edu.berkeley.nlp.entity.ner import 
scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.coref.UID -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.fig.basic.Indexer import edu.berkeley.nlp.entity.Driver @@ -10,14 +10,14 @@ import edu.berkeley.nlp.futile.util.Logger trait NerPruner { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]]; + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]]; } @SerialVersionUID(1L) class NerPrunerFromModel(val nerModel: NerSystemLabeled, val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { val sentMarginals = nerModel.computeLogMarginals(doc.words(sentIdx).toArray, doc.pos(sentIdx).toArray); NerPruner.pruneFromMarginals(sentMarginals, nerModel.labelIndexer, pruningThreshold); } @@ -28,7 +28,7 @@ class NerPrunerFromMarginals(val nerMarginals: HashMap[UID,Seq[Array[Array[Float val neLabelIndexer: Indexer[String], val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { require(nerMarginals.contains(doc.uid), "Doc ID " + doc.uid + " doesn't have precomputed NER marginals" + " and the NER pruner in this model is configured to rely on these. 
You need to either change" + " how you specify the pruner (if training) or use a different model entirely (if testing)"); @@ -42,7 +42,7 @@ class NerPrunerFromMarginalsAndModel(val nerMarginals: HashMap[UID,Seq[Array[Arr val nerModel: NerSystemLabeled, val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { val sentMarginals = if (nerMarginals.contains(doc.uid)) { nerMarginals(doc.uid)(sentIdx) } else { diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala index 2d1bb7a..7cf1b43 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala @@ -2,11 +2,10 @@ package edu.berkeley.nlp.entity.ner import edu.berkeley.nlp.futile.fig.basic.Indexer import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.futile.classify.GeneralLogisticRegression import edu.berkeley.nlp.entity.coref.CorefSystem import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.classify.SequenceExample import edu.berkeley.nlp.futile.fig.basic.IOUtils import java.io.FileInputStream @@ -15,12 +14,9 @@ import java.io.File import java.io.FileOutputStream import java.io.ObjectOutputStream import edu.berkeley.nlp.futile.util.Counter -import edu.berkeley.nlp.entity.Chunk import scala.collection.mutable.HashMap -import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.lang.Language import scala.util.Random -import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.math.SloppyMath import edu.berkeley.nlp.entity.wiki.WikipediaInterface import 
edu.berkeley.nlp.entity.coref.UID @@ -194,7 +190,8 @@ object NerSystemLabeled { // transitionMatrix.map(_.map(arr => if (arr != null) arr.map(featureIndexer.getIndex(_)) else null)); // } - def replaceNer(doc: ConllDoc, newChunks: Seq[Seq[Chunk[String]]]) = { + def replaceNer(doc: Document, newChunks: Seq[Seq[Chunk[String]]]) = { + // MFL TODO: ?? need to make it work either way? new ConllDoc(doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees, newChunks, doc.corefChunks, doc.speakers); } @@ -227,7 +224,7 @@ object NerSystemLabeled { // TRAINING - def trainNerSystem(trainDocs: Seq[ConllDoc], + def trainNerSystem(trainDocs: Seq[Document], maybeBrownClusters: Option[Map[String,String]], nerFeatureSet: Set[String], reg: Double, @@ -267,7 +264,7 @@ object NerSystemLabeled { // EVALUATION - def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[ConllDoc]) { + def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[Document]) { val labelIndexer = nerSystem.labelIndexer; Logger.logss("Extracting test examples"); val testExamples = extractNerChunksFromConll(testDocs); @@ -332,7 +329,7 @@ object NerSystemLabeled { } } - def extractNerChunksFromConll(docs: Seq[ConllDoc]): Seq[NerExample] = { + def extractNerChunksFromConll(docs: Seq[Document]): Seq[NerExample] = { val chunkTypeCounts = new Counter[String]; val examples = docs.flatMap(doc => { val chunksToUse = doc.nerChunks diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala index 19ac409..9e8ee9e 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala @@ -1,7 +1,7 @@ package edu.berkeley.nlp.entity.preprocess import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import scala.collection.JavaConverters._ import 
scala.collection.mutable.ArrayBuffer import java.io.PrintWriter diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala index 8ac70d1..85c7a97 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala @@ -1,5 +1,5 @@ package edu.berkeley.nlp.entity.preprocess -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.CorefSystem import scala.io.Source import scala.collection.mutable.ArrayBuffer @@ -99,8 +99,8 @@ object SentenceSplitter { def featurize(featureIndexer: Indexer[String], addToIndexer: Boolean): Array[Int] = { val featStrs = new ArrayBuffer[String]; - val pw = prevWord; - val fw = followingWord; + val pw = if(prevWord.isEmpty) " " else prevWord + val fw = if (followingWord.isEmpty) " " else followingWord val fwcls = (if (Character.isUpperCase(fw.charAt(0))) "UC" else if (Character.isLowerCase(fw.charAt(0))) "LC" else if (!Character.isLetterOrDigit(fw.charAt(0))) "PU" else "OTHER"); featStrs += ("Bias=1"); featStrs += ("LastChar=" + pw.last); @@ -242,7 +242,7 @@ object SentenceSplitter { } - private def readExamplesFromConll(docs: Seq[ConllDoc]): Seq[SplitExample] = { + private def readExamplesFromConll(docs: Seq[Document]): Seq[SplitExample] = { // N.B. we only loop up until size - 1 since the end of the last sentence // has no following context and isn't a good training example. // We extract pretty much all positives except for really weird stuff. 
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 88f9ff2..69fd469 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -5,9 +5,8 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.ConllDocReader +import edu.berkeley.nlp.entity.{WikiDocReader, ConllDocReader, GUtil} import edu.berkeley.nlp.entity.coref.CorefDocAssembler -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.entity.coref.MentionPropertyComputer import edu.berkeley.nlp.entity.lang.Language @@ -221,11 +220,15 @@ object WikipediaInterface { } else if (WikipediaInterface.mentionType == "ontonotes") { // OntoNotes: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); + } else if (WikipediaInterface.mentionType == "wikipedia") { + WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)) } else { throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType); } }); // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; + + // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. 
val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet; Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { From 57c24808623a96f650507cd7a32732960e38cf94 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 21 Feb 2015 20:53:21 -0800 Subject: [PATCH 02/25] trying to build wiki to conll interface similar to the existing raw text processor --- .gitignore | 3 +- .../berkeley/nlp/entity/WikiDocReader.scala | 6 +- .../preprocess/PreprocessingDriver.java | 6 +- .../entity/preprocess/WikiPreprocessor.scala | 167 ++++++++++++++++++ 4 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala diff --git a/.gitignore b/.gitignore index d7ce67f..99fe3d9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ expers/ models/ project/project/ project/target/ -target/ \ No newline at end of file +target/ +specify_execDir/ diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 0896864..8865ae4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -34,12 +34,12 @@ class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) - + for(reference <- refxml \ "ReferenceInstance") { val surfaceForm = (reference \ "SurfaceForm")(0).text.trim - val offset = (reference \ "offset")(0).text.trim.toInt - val length = (reference \ "length")(0).text.trim.toInt + val offset = (reference \ "Offset")(0).text.trim.toInt + val 
length = (reference \ "Length")(0).text.trim.toInt val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim val annotatorId = (reference \ "AnnotatorId")(0).text.trim val annotation = (reference \ "Annotation")(0).text.trim diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java index 1d3a0d7..78fba09 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java @@ -12,6 +12,7 @@ import edu.berkeley.nlp.PCFGLA.TreeAnnotations; import edu.berkeley.nlp.entity.ConllDocJustWords; import edu.berkeley.nlp.entity.ConllDocReader; +import edu.berkeley.nlp.entity.WikiDocReader; import edu.berkeley.nlp.entity.lang.Language; import edu.berkeley.nlp.entity.ner.NerSystemLabeled; import edu.berkeley.nlp.futile.fig.basic.IOUtils; @@ -92,7 +93,7 @@ public class PreprocessingDriver implements Runnable { public static boolean useAlternateTokenizer = false; public static enum Mode { - RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL; + RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL, WIKILIMITED; } public static void main(String[] args) { @@ -128,6 +129,9 @@ public void run() { Logger.logss("Processed document " + docName + " and wrote result to " + outputDir); } writer.close(); + } else if (mode == Mode.WIKILIMITED) { + WikiDocReader docReader = new WikiDocReader(Language.ENGLISH, ""); + WikiPreprocessor.processesDocs(inputDir + "/", outputDir + "/", docReader, splitter, parser, backoffParser, nerSystem); } else { ConllDocReader docReader = new ConllDocReader(Language.ENGLISH, ""); for (File inputFile : new File(inputDir).listFiles()) { diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala new file mode 100644 index 0000000..e6ed112 --- /dev/null +++ 
b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -0,0 +1,167 @@ +package edu.berkeley.nlp.entity.preprocess + +import java.io.File + +import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser +import edu.berkeley.nlp.entity.{Chunk, WikiDocReader} +import edu.berkeley.nlp.entity.ner.NerSystemLabeled +import edu.berkeley.nlp.futile.util.Logger +import edu.berkeley.nlp.syntax.Tree + +import scala.xml._ +import scala.concurrent._ +import scala.collection.JavaConverters._ + +import ExecutionContext.Implicits.global + +/** + * Created by matthew on 2/21/15. + */ +object WikiPreprocessor { + + def processesDocs (inputDir : String, outputDir : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) = { + new File(inputDir).listFiles.map(file => { + val input_file = file.getAbsolutePath + val output_file = outputDir + file.getName + //Future { + process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) + //} + })//.foreach(Await.result(_, duration.Duration.Inf)) + } + + def process(inputFile : String, outputFile : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) = { + /*String docName = inputPath; + String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); + String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); + String[] sentences = null; + if (skipSentenceSplitting) { + sentences = canonicalizedParagraphs; + } else { + sentences = splitter.splitSentences(canonicalizedParagraphs); + } + String[][] tokenizedSentences = (useAlternateTokenizer ? 
splitter.tokenizeAlternate(sentences) : splitter.tokenize(sentences)); + Logger.logss("Document " + docName + " contains " + lines.length + " lines and " + tokenizedSentences.length + " sentences"); + String[][] docConllLines = renderDocConllLines(docName, tokenizedSentences, parser, backoffParser, nerSystem); + writeConllLines(docName, docConllLines, outputPath); +*/ + + /* + String[][] conllLines = new String[tokenizedSentences.length][]; + for (int sentIdx = 0; sentIdx < tokenizedSentences.length; sentIdx++) { + String[] tokenizedSentence = tokenizedSentences[sentIdx]; + Tree parse = parse(parser, backoffParser, Arrays.asList(tokenizedSentence)); + if (parse.getYield().size() != tokenizedSentence.length) { + Logger.logss("WARNING: couldn't parse sentence, dropping it: " + Arrays.toString(tokenizedSentence)); + Logger.logss(" (This will be fixed to backing off to an X-bar grammar in a future release)"); + } else { + String[] posTags = new String[tokenizedSentence.length]; + List preterminals = parse.getPreTerminalYield(); + for (int i = 0; i < preterminals.size(); i++) { + posTags[i] = preterminals.get(i); + } + String[] nerBioLabels = null; + if (nerSystem != null) { + nerBioLabels = nerSystem.tagBIO(tokenizedSentence, posTags); + } else { + nerBioLabels = new String[tokenizedSentence.length]; + Arrays.fill(nerBioLabels, "O"); + } + conllLines[sentIdx] = renderSentenceConllLines(docName, 0, tokenizedSentence, posTags, parse, nerBioLabels); + } + } + return conllLines; + + */ + + Logger.logss("starting processing of " + inputFile) + val referencesFile = inputFile.replace("RawTexts", "Problems") + val refxml = XML.loadFile(referencesFile) + val document = scala.io.Source.fromFile(inputFile).mkString.split("\n") + + val references = (refxml \ "ReferenceInstance").map(r => ( + (r \ "SurfaceForm")(0).text.trim, + (r \ "Offset")(0).text.trim.toInt, + (r \ "Length")(0).text.trim.toInt, + (r \ "ChosenAnnotation")(0).text.trim, + (r \ "AnnotatorId")(0).text.trim, + (r \ 
"Annotation")(0).text.trim + )) + + val canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(document, false, false) + val sentences = splitter.splitSentences(canonicalizedParagraphs) + val tokens = SentenceSplitter.tokenize(sentences) + + + val doclenratio = sentences.map(_.size).sum.toFloat / document.map(_.size + 1).sum + def refFinder (ref : (String, Int, Int, String, String, String)) : (Int, Chunk[String]) = { + val d = doclenratio * (ref._2 + ref._3 / 2.0) + var cnt = 0 + val wrds = ref._1.replace(" ", "") + def rank_match(i : Int, j : Int) : Double = { + val res = tokens(i).drop(j).reduce(_+_) + for(q <- 0 until Math.min(wrds.size, res.size)) { + if (res(q) != wrds(q)) + return q.toDouble / wrds.size + } + 1.0 + } + for(i <- 0 to sentences.size) { + cnt += sentences(i).size + if(cnt > d) { + // assume that the reference is in this sentence + var ll = cnt - sentences(i).size + d // estimated place in sentence + var tcnt = 0 + var best_start = 0 + var best_rank = Double.NegativeInfinity + + for(j <- 0 until tokens(i).size) { + val r = rank_match(i,j) * Math.abs(ll - tcnt) // try and make the item close to where it should be + if(r > best_rank) { + best_start = j + best_rank = r + } + tcnt += tokens(i)(j).size + } + var len = 0 + var len_cnt = 0 + for(j <- best_start until tokens(i).size; if len_cnt < wrds.size) { + len_cnt += tokens(i)(j).size + len += 1 + } + return (i, new Chunk(best_start, best_start + len, ref._4)) + } + } + (-1, null) + } + + val refplaces = references.map(refFinder) + + val refsorted = refplaces.foldLeft(Map[Int, List[Chunk[String]]]().withDefaultValue(List()))((m, itm) => { + if(itm._1 != -1) { + m.updated(itm._1, m(itm._1) :+ itm._2) + } else + m + }) + + val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + // ... 
filter out the ones where the parses don't match, idk how that is going to effect + var tps = (tokens zip parses).filter((t) => t._1.length == t._2.getYield.size) + + Logger.logss("done with "+inputFile) + + + + + } + +} From 9b050c6c909bddf5c2a4a924521f09aa594dab5f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 24 Feb 2015 18:16:22 -0800 Subject: [PATCH 03/25] -__- --- .../edu/berkeley/nlp/entity/Document.scala | 4 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 4 +- .../entity/preprocess/WikiPreprocessor.scala | 76 ++++++++++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala index 8a2ef9d..44555df 100644 --- a/src/main/java/edu/berkeley/nlp/entity/Document.scala +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -8,13 +8,13 @@ trait Document { def docPartNo : Int // arrays of words in each sentence including punc def words : Seq[Seq[String]] - // the gram types of the words + // the grammar types of the words def pos : Seq[Seq[String]] // parse trees of each sentence def trees : Seq[DepConstTree] // I am guessing the type of the chunk eg: ORG-NAM def nerChunks : Seq[Seq[Chunk[String]]] - // have ranges and identifiers for the unique item that they are referenceing + // have ranges and identifiers for the unique item that they are referencing // appears [start, end) def corefChunks : Seq[Seq[Chunk[Int]]] // just use "-" for each in the case that the speaker is unknown diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala index bcec448..343703b 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -10,7 +10,8 @@ case class WikiDoc (docID : String, trees: Seq[DepConstTree], nerChunks : Seq[Seq[Chunk[String]]], corefChunks : Seq[Seq[Chunk[Int]]], - speakers : Seq[Seq[String]] ) 
extends Document { + speakers : Seq[Seq[String]], + wikiRefChunks : Seq[Seq[Chunk[String]]] ) extends Document { override val numSents = words.size; @@ -30,5 +31,6 @@ case class WikiDoc (docID : String, override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None; + //override def corefChunks = throw new NotImplementedError() } diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index e6ed112..bf7a7e3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -3,11 +3,14 @@ package edu.berkeley.nlp.entity.preprocess import java.io.File import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.{Chunk, WikiDocReader} +import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder +import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.syntax.Tree +import edu.berkeley.nlp.futile.fig.basic.Indexer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.xml._ import scala.concurrent._ import scala.collection.JavaConverters._ @@ -19,6 +22,8 @@ import ExecutionContext.Implicits.global */ object WikiPreprocessor { + val headFinder = new ModCollinsHeadFinder() + def processesDocs (inputDir : String, outputDir : String, docReader : WikiDocReader, splitter : SentenceSplitter, @@ -40,6 +45,43 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { + val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) + + } + + def wikiToLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Array[String]]() + for(i <- 0 until 
wdoc.words.size) { + // val rend = PreprocessingDriver.renderSentenceConllLines(wdoc.docID, 0, wdoc.words(i), ) + //ret.append("test") + } + ret.toSeq.map(_.toSeq) + } + + def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { + var ret = new Array[String](cr.size) + for(i <- 0 until cr.size) { + var sb = new StringBuilder + for(c <- cr) { + + if(c.start == i) { + sb.append("(") + sb.append(c.label) + } + if(c.end == i + 1) + sb.append(")") + + } + } + ret + } + + def mkWikiDoc(inputFile : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) : WikiDoc = { /*String docName = inputPath; String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); @@ -87,6 +129,8 @@ object WikiPreprocessor { val referencesFile = inputFile.replace("RawTexts", "Problems") val refxml = XML.loadFile(referencesFile) val document = scala.io.Source.fromFile(inputFile).mkString.split("\n") + val refname = (refxml \ "ReferenceFileName")(0).text.trim + val references = (refxml \ "ReferenceInstance").map(r => ( (r \ "SurfaceForm")(0).text.trim, @@ -125,7 +169,7 @@ object WikiPreprocessor { var best_rank = Double.NegativeInfinity for(j <- 0 until tokens(i).size) { - val r = rank_match(i,j) * Math.abs(ll - tcnt) // try and make the item close to where it should be + val r = rank_match(i,j) / Math.abs(ll - tcnt) // try and make the item close to where it should be if(r > best_rank) { best_start = j best_rank = r @@ -155,13 +199,37 @@ object WikiPreprocessor { val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) // ... 
filter out the ones where the parses don't match, idk how that is going to effect - var tps = (tokens zip parses).filter((t) => t._1.length == t._2.getYield.size) + val tps = (tokens, parses, 0 until tokens.size).zipped + .filter((a,b,c) => a.length == b.getYield.size) - Logger.logss("done with "+inputFile) + //val indexer = new Indexer[String]() + val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala }) + val trees = for(i <- 0 until tps._1.size) yield { + val childParentMap = DepConstTree.extractDependencyStructure(tps._2(i), headFinder) + new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap) + } + val wikiDoc = new WikiDoc( + docID=inputFile, + docPartNo=refname.toInt, + words=tps._1.toSeq.map(_.toSeq), + pos=null, // todo + trees=tps._2.toSeq.map(t => { + new DepConstTree(t, ) + }), + nerChunks=null, // todo + corefChunks=tps._3.map(i => { + refsorted(i).map(_.hashCode).asInstanceOf[Seq[Int]] + }).asInstanceOf[Seq[Seq[Int]]], + speakers=null, + wikiRefChunks=tps._3.map(refsorted(_)) + ) + + Logger.logss("done with "+inputFile) + wikiDoc } } From deab92afc07048f417678d7f1bfdc78bf9648b79 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 24 Feb 2015 18:42:28 -0800 Subject: [PATCH 04/25] wiki doc appears to be correctly put together now --- .../entity/preprocess/WikiPreprocessor.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index bf7a7e3..92b475a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -7,7 +7,7 @@ import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} import edu.berkeley.nlp.entity.ner.NerSystemLabeled import 
edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.syntax.Tree +import edu.berkeley.nlp.futile.syntax.Tree import edu.berkeley.nlp.futile.fig.basic.Indexer import scala.collection.mutable.{ArrayBuffer, ListBuffer} @@ -197,12 +197,13 @@ object WikiPreprocessor { m }) - val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + val parses : Array[Tree[String]] = tokens.map(t => Reprocessor.convertToFutileTree( + PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))) // ... filter out the ones where the parses don't match, idk how that is going to effect val tps = (tokens, parses, 0 until tokens.size).zipped .filter((a,b,c) => a.length == b.getYield.size) - //val indexer = new Indexer[String]() + val indexer = new Indexer[String]() val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala }) @@ -211,19 +212,19 @@ object WikiPreprocessor { new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap) } + val empty = tps._1.map(l => (0 until l.length).map(a=>"-")).toSeq + val wikiDoc = new WikiDoc( docID=inputFile, docPartNo=refname.toInt, words=tps._1.toSeq.map(_.toSeq), - pos=null, // todo - trees=tps._2.toSeq.map(t => { - new DepConstTree(t, ) - }), - nerChunks=null, // todo + pos=pos, + trees=trees, + nerChunks=tps._1.map(a=>Seq()), // todo corefChunks=tps._3.map(i => { - refsorted(i).map(_.hashCode).asInstanceOf[Seq[Int]] - }).asInstanceOf[Seq[Seq[Int]]], - speakers=null, + refsorted(i).map(c => new Chunk(c.start, c.end, indexer.getIndex(c.label))) + }), + speakers=empty, // todo? 
wikiRefChunks=tps._3.map(refsorted(_)) ) From 3fdd5698f8101a6f865c6fa968d67b2a26ab8487 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 16:20:31 -0800 Subject: [PATCH 05/25] seems to be generating the correct output --- .../entity/preprocess/WikiPreprocessor.scala | 106 ++++++++++++++---- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 92b475a..82c75ea 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -9,6 +9,7 @@ import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.futile.syntax.Tree import edu.berkeley.nlp.futile.fig.basic.Indexer +import edu.berkeley.nlp.futile.fig.basic.IOUtils import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.xml._ @@ -46,36 +47,103 @@ object WikiPreprocessor { backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) + val lines = wikiToConllLines(wdoc) + val wlines = wikiToWikiLines(wdoc) + //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) + writeWikiLines(wdoc.docID, lines, outputFile) + writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki")) + } + def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = { + var writer = IOUtils.openOutHard(outputFile) + writer.println("#begin document (" + docID + "); part 000") + lines.foreach(l => { + l.foreach(writer.println(_)) + writer.println + }) + writer.close() } - def wikiToLines(wdoc : WikiDoc) : Seq[Seq[String]] = { - val ret = ListBuffer[Array[String]]() - for(i <- 0 until wdoc.words.size) { - // val rend = 
PreprocessingDriver.renderSentenceConllLines(wdoc.docID, 0, wdoc.words(i), ) - //ret.append("test") + def wikiToConllLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Seq[String]]() + //ret.append("#begin document (" + wdoc.docID + "); part " + wdoc.docPartNo) + for(i <- 0 until wdoc.numSents) { + val parseBits = PreprocessingDriver.computeParseBits(Reprocessor.convertFromFutileTree(wdoc.trees(i).constTree)) + //val nerBits = PreprocessingDriver.computeNerBits(wdoc.nerChunks(i).toArray) + val corefBits = computeBits(wdoc.corefChunks(i), wdoc.words(i).size) + var lines = new ListBuffer[String]() + // conll: [doc name] [part num] [word num] [word] [pos] [parsebit] [6] [7] [8] [speakers] [nerbit] [corefbit] + for(j <- 0 until wdoc.words(i).size) { + lines.append(wdoc.docID + "\t" + + wdoc.docPartNo + "\t" + + j + "\t" + + wdoc.words(i)(j) + "\t" + + wdoc.pos(i)(j) + "\t" + + parseBits(j) + "\t" + + "\t-\t-\t-\t" + + "-\t" + // speakers + "-\t" + // nerbit + corefBits(j) + "\t" // coref bits + ) + } + ret.append(lines.toSeq) } - ret.toSeq.map(_.toSeq) + ret.toSeq } - def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { - var ret = new Array[String](cr.size) - for(i <- 0 until cr.size) { - var sb = new StringBuilder - for(c <- cr) { - - if(c.start == i) { - sb.append("(") - sb.append(c.label) - } - if(c.end == i + 1) - sb.append(")") + def computeBits[T](items : Seq[Chunk[T]], len : Int) : Array[String] = { + var ret = Array.fill(len)(List[String]()) + items.foreach(c => { + if(c.start == c.end -1) { + ret(c.start) = ret(c.start) :+ ("(" + c.label + ")") + } else { + ret(c.start) = ret(c.start) :+ ("(" + c.label) + ret(c.end) = ret(c.end) :+ (c.label + ")") + } + }) + ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) + } + def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Seq[String]]() + for(i <- 0 until wdoc.numSents) { + val lines = new ListBuffer[String]() + for(j <- 0 until wdoc.words(i).size) { + 
var s = "" + wdoc.wikiRefChunks(i).foreach(c => { + if(c.start == j) + s = "(" + c.label + }) + s += "*" + wdoc.wikiRefChunks(i).foreach(c => { + if(c.end == j + 1) + s += ")" + }) + lines.append(s) } + ret.append(lines.toSeq) } - ret + ret.toSeq } +// def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { +// var ret = new Array[String](cr.size) +// for(i <- 0 until cr.size) { +// var sb = new StringBuilder +// for(c <- cr) { +// +// if(c.start == i) { +// sb.append("(") +// sb.append(c.label) +// } +// if(c.end == i + 1) +// sb.append(")") +// +// } +// } +// ret +// } + def mkWikiDoc(inputFile : String, docReader : WikiDocReader, splitter : SentenceSplitter, From 8636e264930fb64fe8225364c04429738773ed44 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 16:26:30 -0800 Subject: [PATCH 06/25] use threads --- Makefile | 3 +++ .../berkeley/nlp/entity/preprocess/WikiPreprocessor.scala | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index eab9e29..28f361a 100644 --- a/Makefile +++ b/Makefile @@ -12,3 +12,6 @@ aceTester: $(TARGET) queryModel: $(TARGET) java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz + +wikiLimited: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.preprocess.PreprocessingDriver ++config/base.conf -inputDir ../WikificationACL2011Data/WikipediaSample/RawTextsTrain/ -outputDir /tmp/gggg/raw/ -mode WIKILIMITED diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 82c75ea..624c8d5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -34,10 +34,10 @@ object WikiPreprocessor { new File(inputDir).listFiles.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName - 
//Future { + Future { process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) - //} - })//.foreach(Await.result(_, duration.Duration.Inf)) + } + }).foreach(Await.result(_, duration.Duration.Inf)) } def process(inputFile : String, outputFile : String, From ce8ab3f4a7fb627bdfd600056932093d9f677fac Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 18:51:10 -0800 Subject: [PATCH 07/25] minor bug fixes --- Makefile | 2 +- .../entity/preprocess/WikiPreprocessor.scala | 26 ++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 28f361a..61b103e 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar all: $(TARGET) -$(TARGET): $(wildcard src/**) +$(TARGET): $(wildcard src/**/*) sbt assembly aceTester: $(TARGET) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 624c8d5..d872f7f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -35,7 +35,16 @@ object WikiPreprocessor { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName Future { - process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) + try { + process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) + } catch { + case e : Exception => { + Logger.logss("failed file: "+input_file) + System.err.print(e.toString) + e.printStackTrace(System.err) + null + } + } } }).foreach(Await.result(_, duration.Duration.Inf)) } @@ -98,7 +107,7 @@ object WikiPreprocessor { ret(c.start) = ret(c.start) :+ ("(" + c.label + ")") } else { ret(c.start) = ret(c.start) :+ ("(" + c.label) - ret(c.end) = ret(c.end) :+ (c.label + ")") + ret(c.end - 
1) = ret(c.end - 1) :+ (c.label + ")") } }) ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) @@ -265,8 +274,17 @@ object WikiPreprocessor { m }) - val parses : Array[Tree[String]] = tokens.map(t => Reprocessor.convertToFutileTree( - PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))) + val parses: Array[Tree[String]] = tokens.map(t => { + //try { + Reprocessor.convertToFutileTree( + PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + /*} catch { + case e : java.lang.NullPointerException => { + null; + } + }*/ + }) + // ... filter out the ones where the parses don't match, idk how that is going to effect val tps = (tokens, parses, 0 until tokens.size).zipped .filter((a,b,c) => a.length == b.getYield.size) From 4346d0382712cd8794347539bd4261b9d6a0b164 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sun, 1 Mar 2015 15:26:18 -0800 Subject: [PATCH 08/25] trying to run the wikipedia interface now --- build.sbt | 1 + resources/Messages_de.properties | 9 + resources/Messages_en.properties | 36 ++ resources/Messages_es.properties | 8 + resources/Messages_fr.properties | 8 + resources/Messages_it.properties | 8 + resources/Messages_pt_BR.properties | 38 ++ resources/interwiki.properties | 392 ++++++++++++++++++ resources/operators.txt | 27 ++ .../berkeley/nlp/entity/ConllDocReader.scala | 15 +- .../berkeley/nlp/entity/WikiDocReader.scala | 35 -- .../entity/preprocess/WikiPreprocessor.scala | 61 +-- .../nlp/entity/wiki/WikipediaInterface.scala | 2 +- 13 files changed, 541 insertions(+), 99 deletions(-) create mode 100644 resources/Messages_de.properties create mode 100644 resources/Messages_en.properties create mode 100644 resources/Messages_es.properties create mode 100644 resources/Messages_fr.properties create mode 100644 resources/Messages_it.properties create mode 100644 resources/Messages_pt_BR.properties create mode 100644 resources/interwiki.properties create mode 100644 resources/operators.txt diff --git 
a/build.sbt b/build.sbt index 91a4b9b..a3fe7b7 100644 --- a/build.sbt +++ b/build.sbt @@ -10,3 +10,4 @@ assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") +unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } diff --git a/resources/Messages_de.properties b/resources/Messages_de.properties new file mode 100644 index 0000000..51b38e9 --- /dev/null +++ b/resources/Messages_de.properties @@ -0,0 +1,9 @@ +wiki.tags.toc.content=Inhaltsverzeichnis +wiki.api.url=http://de.wikipedia.org/w/api.php +wiki.api.category1=Kategorie +wiki.api.image1=Datei +wiki.api.image2=Bild +wiki.api.template1=Vorlage +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_en.properties b/resources/Messages_en.properties new file mode 100644 index 0000000..6b9e2f0 --- /dev/null +++ b/resources/Messages_en.properties @@ -0,0 +1,36 @@ +wiki.tags.toc.content=Contents +wiki.api.url=http://en.wikipedia.org/w/api.php +wiki.api.media1=Media +wiki.api.media2=Media +wiki.api.special1=Special +wiki.api.special2=Special +wiki.api.talk1=Talk +wiki.api.talk2=Talk +wiki.api.user1=User +wiki.api.user2=User +wiki.api.usertalk1=User_talk +wiki.api.usertalk2=User_talk +wiki.api.meta1=Meta +wiki.api.meta2=Meta +wiki.api.metatalk1=Meta_talk +wiki.api.metatalk2=Meta_talk +wiki.api.image1=Image +wiki.api.image2=File +wiki.api.imagetalk1=Image_talk +wiki.api.imagetalk2=File_talk +wiki.api.mediawiki1=MediaWiki +wiki.api.mediawiki2=MediaWiki +wiki.api.mediawikitalk1=MediaWiki_talk +wiki.api.mediawikitalk2=MediaWiki_talk +wiki.api.template1=Template +wiki.api.template2=Template +wiki.api.templatetalk1=Template_talk +wiki.api.templatetalk2=Template_talk +wiki.api.help1=Help +wiki.api.help2=Help +wiki.api.helptalk1=Help_talk +wiki.api.helptalk2=Help_talk +wiki.api.category1=Category +wiki.api.category2=Category +wiki.api.categorytalk1=Category_talk 
+wiki.api.categorytalk2=Category_talk \ No newline at end of file diff --git a/resources/Messages_es.properties b/resources/Messages_es.properties new file mode 100644 index 0000000..bc50428 --- /dev/null +++ b/resources/Messages_es.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Contenido +wiki.api.url=http://es.wikipedia.org/w/api.php +wiki.api.category1=Categor\u00EDa +wiki.api.image1=Imagen +wiki.api.template1=Plantilla +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template diff --git a/resources/Messages_fr.properties b/resources/Messages_fr.properties new file mode 100644 index 0000000..2a76842 --- /dev/null +++ b/resources/Messages_fr.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Sommaire +wiki.api.url=http://fr.wikipedia.org/w/api.php +wiki.api.category1=Cat\u00E9gorie +wiki.api.image1=Image +wiki.api.template1=Mod\u00E8le +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_it.properties b/resources/Messages_it.properties new file mode 100644 index 0000000..97778a3 --- /dev/null +++ b/resources/Messages_it.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Indice +wiki.api.url=http://it.wikipedia.org/w/api.php +wiki.api.category1=Categoria +wiki.api.image1=Immagine +wiki.api.template1=Template +wiki.api.category2=Category +wiki.api.image2=File +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_pt_BR.properties b/resources/Messages_pt_BR.properties new file mode 100644 index 0000000..e0baaf7 --- /dev/null +++ b/resources/Messages_pt_BR.properties @@ -0,0 +1,38 @@ +#Generated by ResourceBundle Editor (http://eclipse-rbe.sourceforge.net) + +wiki.api.category1 = Categoria +wiki.api.category2 = Categoria +wiki.api.categorytalk1 = Categoria_falar +wiki.api.categorytalk2 = Categoria_falar +wiki.api.help1 = Ajuda +wiki.api.help2 = Ajuda +wiki.api.helptalk1 = Ajuda_falar +wiki.api.helptalk2 = Ajuda_falar 
+wiki.api.image1 = Imagem +wiki.api.image2 = Arquivo +wiki.api.imagetalk1 = Imagem_falar +wiki.api.imagetalk2 = Arquivo_falar +wiki.api.media1 = M\u00EDdia +wiki.api.media2 = M\u00EDdia +wiki.api.mediawiki1 = MediaWiki +wiki.api.mediawiki2 = MediaWiki +wiki.api.mediawikitalk1 = MediaWiki_falar +wiki.api.mediawikitalk2 = MediaWiki_falar +wiki.api.meta1 = Meta +wiki.api.meta2 = Meta +wiki.api.metatalk1 = Meta_falar +wiki.api.metatalk2 = Meta_falar +wiki.api.special1 = Especial +wiki.api.special2 = Especial +wiki.api.talk1 = Falar +wiki.api.talk2 = Falar +wiki.api.template1 = Modelo +wiki.api.template2 = Modelo +wiki.api.templatetalk1 = Modelo_falar +wiki.api.templatetalk2 = Modelo_falar +wiki.api.url = http://br.wikipedia.org/w/api.php +wiki.api.user1 = Usu\u00E1rio +wiki.api.user2 = Usu\u00E1rio +wiki.api.usertalk1 = Usu\u00E1rio_falar +wiki.api.usertalk2 = Usu\u00E1rio_falar +wiki.tags.toc.content = Conte\u00FAdo diff --git a/resources/interwiki.properties b/resources/interwiki.properties new file mode 100644 index 0000000..b312b1e --- /dev/null +++ b/resources/interwiki.properties @@ -0,0 +1,392 @@ +be-x-old=http://be-x-old.wikipedia.org/wiki/${title} +tavi=http://tavi.sourceforge.net/${title} +xh=http://xh.wikipedia.org/wiki/${title} +lasvegaswiki=http://wiki.gmnow.com/index.php/${title} +pmeg=http://www.bertilow.com/pmeg/${title}.php +warpedview=http://www.warpedview.com/index.php/${title} +slashdot=http://slashdot.org/article.pl?sid=${title} +wikimedia=http://wikimediafoundation.org/wiki/${title} +wikia=http://www.wikia.com/wiki/index.php/${title} +wo=http://wo.wikipedia.org/wiki/${title} +jefo=http://www.esperanto-jeunes.org/vikio/index.php?${title} +openfacts=http://openfacts.berlios.de/index.phtml?title=${title} +lqwiki=http://wiki.linuxquestions.org/wiki/${title} +wa=http://wa.wikipedia.org/wiki/${title} +ciscavate=http://ciscavate.org/index.php/${title} +demokraatia=http://wiki.demokraatia.ee/ +efnetpythonwiki=http://purl.net/wiki/python/${title} 
+mediazilla=http://bugzilla.wikipedia.org/${title} +wikiquote=http://en.wikiquote.org/wiki/${title} +jbo=http://jbo.wikipedia.org/wiki/${title} +vo=http://vo.wikipedia.org/wiki/${title} +vi=http://vi.wikipedia.org/wiki/${title} +gamewiki=http://gamewiki.org/wiki/index.php/${title} +hewikisource=http://he.wikisource.org/wiki/${title} +ve=http://ve.wikipedia.org/wiki/${title} +google=http://www.google.com/search?q=${title} +uz=http://uz.wikipedia.org/wiki/${title} +drumcorpswiki=http://www.drumcorpswiki.com/index.php/${title} +nah=http://nah.wikipedia.org/wiki/${title} +ur=http://ur.wikipedia.org/wiki/${title} +jiniwiki=http://www.cdegroot.com/cgi-bin/jini?${title} +uk=http://uk.wikipedia.org/wiki/${title} +ug=http://ug.wikipedia.org/wiki/${title} +osi=reference model=http://wiki.tigma.ee/ +mbtest=http://www.usemod.com/cgi-bin/mbtest.pl?${title} +disinfopedia=http://www.disinfopedia.org/wiki.phtml?title=${title} +ty=http://ty.wikipedia.org/wiki/${title} +squeak=http://minnow.cc.gatech.edu/squeak/${title} +tw=http://tw.wikipedia.org/wiki/${title} +tlh=http://tlh.wikipedia.org/wiki/${title} +tt=http://tt.wikipedia.org/wiki/${title} +ts=http://ts.wikipedia.org/wiki/${title} +tr=http://tr.wikipedia.org/wiki/${title} +scoutpedia=http://www.scoutpedia.info/index.php/${title} +minnan=http://zh-min-nan.wikipedia.org/wiki/${title} +to=http://to.wikipedia.org/wiki/${title} +tn=http://tn.wikipedia.org/wiki/${title} +wikinfo=http://www.wikinfo.org/wiki.php?title=${title} +s23wiki=http://is-root.de/wiki/index.php/${title} +tl=http://tl.wikipedia.org/wiki/${title} +aiwiki=http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?${title} +tk=http://tk.wikipedia.org/wiki/${title} +ti=http://ti.wikipedia.org/wiki/${title} +th=http://th.wikipedia.org/wiki/${title} +tg=http://tg.wikipedia.org/wiki/${title} +fr.fr=http://fr.fr.wikinations.org/${title} +te=http://te.wikipedia.org/wiki/${title} +csb=http://csb.wikipedia.org/wiki/${title} +theopedia=http://www.theopedia.com/${title} 
+ta=http://ta.wikipedia.org/wiki/${title} +acadwiki=http://xarch.tu-graz.ac.at/autocad/wiki/${title} +efnetceewiki=http://purl.net/wiki/c/${title} +phpwiki=http://phpwiki.sourceforge.net/phpwiki/index.php?${title} +tmwiki=http://www.EasyTopicMaps.com/?page=${title} +sw=http://sw.wikipedia.org/wiki/${title} +benefitswiki=http://www.benefitslink.com/cgi-bin/wiki.cgi?${title} +ecxei=http://www.ikso.net/cgi-bin/wiki.pl?${title} +sv=http://sv.wikipedia.org/wiki/${title} +uea=http://www.tejo.org/uea/${title} +su=http://su.wikipedia.org/wiki/${title} +st=http://st.wikipedia.org/wiki/${title} +ss=http://ss.wikipedia.org/wiki/${title} +sr=http://sr.wikipedia.org/wiki/${title} +sq=http://sq.wikipedia.org/wiki/${title} +so=http://so.wikipedia.org/wiki/${title} +sn=http://sn.wikipedia.org/wiki/${title} +sm=http://sm.wikipedia.org/wiki/${title} +sl=http://sl.wikipedia.org/wiki/${title} +sk=http://sk.wikipedia.org/wiki/${title} +cache=http://www.google.com/search?q=cache:${title} +svgwiki=http://www.protocol7.com/svg-wiki/default.asp?${title} +si=http://si.wikipedia.org/wiki/${title} +smikipedia=http://www.smikipedia.org/${title} +simple=http://simple.wikipedia.org/wiki/${title} +sh=http://sh.wikipedia.org/wiki/${title} +sg=http://sg.wikipedia.org/wiki/${title} +gentoo-wiki=http://gentoo-wiki.com/${title} +se=http://se.wikipedia.org/wiki/${title} +webseitzwiki=http://webseitz.fluxent.com/wiki/${title} +sd=http://sd.wikipedia.org/wiki/${title} +sc=http://sc.wikipedia.org/wiki/${title} +jamwiki=http://jamwiki.org/wiki/en/${title} +sa=http://sa.wikipedia.org/wiki/${title} +greencheese=http://www.greencheese.org/${title} +linuxwiki=http://www.linuxwiki.de/${title} +diveintoosx=http://diveintoosx.org/${title} +bridgeswiki=http://c2.com/w2/bridges/${title} +rw=http://rw.wikipedia.org/wiki/${title} +ru=http://ru.wikipedia.org/wiki/${title} +corpknowpedia=http://corpknowpedia.org/wiki/index.php/${title} +echei=http://www.ikso.net/cgi-bin/wiki.pl?${title} 
+ro=http://ro.wikipedia.org/wiki/${title} +rn=http://rn.wikipedia.org/wiki/${title} +rm=http://rm.wikipedia.org/wiki/${title} +wikispecies=http://species.wikipedia.org/wiki/${title} +webdevwikinl=http://www.promo-it.nl/WebDevWiki/index.php?page=${title} +sourceforge=http://sourceforge.net/${title} +pythonwiki=http://www.pythonwiki.de/${title} +roa-rup=http://roa-rup.wikipedia.org/wiki/${title} +tmnet=http://www.technomanifestos.net/?${title} +gmailwiki=http://www.gmailwiki.com/index.php/${title} +plog4u=http://plog4u.org/index.php/${title} +googlegroups=http://groups.google.com/groups?q=${title} +wikiworld=http://WikiWorld.com/wiki/index.php/${title} +qu=http://qu.wikipedia.org/wiki/${title} +consciousness=http://teadvus.inspiral.org/ +eljwiki=http://elj.sourceforge.net/phpwiki/index.php/${title} +lojban=http://www.lojban.org/tiki/tiki-index.php?page=${title} +usej=http://www.tejo.org/usej/${title} +tokipona=http://tokipona.wikipedia.org/wiki/${title} +mathsongswiki=http://SeedWiki.com/page.cfm?wikiid=237&doc=${title} +got=http://got.wikipedia.org/wiki/${title} +shakti=http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/${title} +memoryalpha=http://www.memory-alpha.org/en/index.php/${title} +cliki=http://ww.telent.net/cliki/${title} +pt=http://pt.wikipedia.org/wiki/${title} +fr.ca=http://fr.ca.wikinations.org/${title} +ps=http://ps.wikipedia.org/wiki/${title} +fur=http://fur.wikipedia.org/wiki/${title} +wikicities=http://www.wikicities.com/index.php/${title} +pl=http://pl.wikipedia.org/wiki/${title} +pi=http://pi.wikipedia.org/wiki/${title} +wiktionary=http://en.wiktionary.org/wiki/${title} +turismo=http://www.tejo.org/turismo/${title} +pa=http://pa.wikipedia.org/wiki/${title} +terrorwiki=http://www.liberalsagainstterrorism.com/wiki/index.php/${title} +finalempire=http://final-empire.sourceforge.net/cgi-bin/wiki.pl?${title} +fr.be=http://fr.wikinations.be/${title} +os=http://os.wikipedia.org/wiki/${title} +or=http://or.wikipedia.org/wiki/${title} 
+netvillage=http://www.netbros.com/?${title} +seattlewireless=http://seattlewireless.net/?${title} +om=http://om.wikipedia.org/wiki/${title} +pangalacticorg=http://www.pangalactic.org/Wiki/${title} +seeds=http://www.IslandSeeds.org/wiki/${title} +oc=http://oc.wikipedia.org/wiki/${title} +raec=http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/${title} +ny=http://ny.wikipedia.org/wiki/${title} +nv=http://nv.wikipedia.org/wiki/${title} +foldoc=http://www.foldoc.org/foldoc/foldoc.cgi?${title} +no=http://no.wikipedia.org/wiki/${title} +nn=http://nn.wikipedia.org/wiki/${title} +metawikipedia=http://meta.wikimedia.org/wiki/${title} +wikif1=http://www.wikif1.org/${title} +nl=http://nl.wikipedia.org/wiki/${title} +ypsieyeball=http://sknkwrks.dyndns.org:1957/writewiki/wiki.pl?${title} +ng=http://ng.wikipedia.org/wiki/${title} +purlnet=http://purl.oclc.org/NET/${title} +ne=http://ne.wikipedia.org/wiki/${title} +nb=http://nb.wikipedia.org/wiki/${title} +abbenormal=http://www.ourpla.net/cgi-bin/pikie.cgi?${title} +na=http://na.wikipedia.org/wiki/${title} +docbook=http://docbook.org/wiki/moin.cgi/${title} +fr.org=http://fr.wikinations.org/${title} +my=http://my.wikipedia.org/wiki/${title} +brasilwiki=http://rio.ifi.unizh.ch/brasilienwiki/index.php/${title} +mt=http://mt.wikipedia.org/wiki/${title} +ms=http://ms.wikipedia.org/wiki/${title} +mr=http://mr.wikipedia.org/wiki/${title} +advogato=http://www.advogato.org/${title} +senseislibrary=http://senseis.xmp.net/?${title} +mo=http://mo.wikipedia.org/wiki/${title} +mn=http://mn.wikipedia.org/wiki/${title} +lutherwiki=http://www.lutheranarchives.com/mw/index.php/${title} +ml=http://ml.wikipedia.org/wiki/${title} +mk=http://mk.wikipedia.org/wiki/${title} +mi=http://mi.wikipedia.org/wiki/${title} +jspwiki=http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=${title} +mh=http://mh.wikipedia.org/wiki/${title} +mg=http://mg.wikipedia.org/wiki/${title} +metaweb=http://www.metaweb.com/wiki/wiki.phtml?title=${title} 
+kmwiki=http://www.voght.com/cgi-bin/pywiki?${title} +efnetxmlwiki=http://purl.net/wiki/xml/${title} +tejo=http://www.tejo.org/vikio/${title} +zwiki=http://www.zwiki.org/${title} +lv=http://lv.wikipedia.org/wiki/${title} +lt=http://lt.wikipedia.org/wiki/${title} +lo=http://lo.wikipedia.org/wiki/${title} +foxwiki=http://fox.wikis.com/wc.dll?Wiki~${title} +ln=http://ln.wikipedia.org/wiki/${title} +emacswiki=http://www.emacswiki.org/cgi-bin/wiki.pl?${title} +li=http://li.wikipedia.org/wiki/${title} +bemi=http://bemi.free.fr/vikio/index.php?${title} +lg=http://lg.wikipedia.org/wiki/${title} +wikibooks=http://en.wikibooks.org/wiki/${title} +lb=http://lb.wikipedia.org/wiki/${title} +la=http://la.wikipedia.org/wiki/${title} +creationmatters=http://www.ourpla.net/cgi-bin/wiki.pl?${title} +ky=http://ky.wikipedia.org/wiki/${title} +kw=http://kw.wikipedia.org/wiki/${title} +kv=http://kv.wikipedia.org/wiki/${title} +pikie=http://pikie.darktech.org/cgi/pikie?${title} +evowiki=http://www.evowiki.org/index.php/${title} +ku=http://ku.wikipedia.org/wiki/${title} +ks=http://ks.wikipedia.org/wiki/${title} +kr=http://kr.wikipedia.org/wiki/${title} +haribeau=http://wiki.haribeau.de/cgi-bin/wiki.pl?${title} +ko=http://ko.wikipedia.org/wiki/${title} +kn=http://kn.wikipedia.org/wiki/${title} +km=http://km.wikipedia.org/wiki/${title} +kl=http://kl.wikipedia.org/wiki/${title} +kk=http://kk.wikipedia.org/wiki/${title} +kj=http://kj.wikipedia.org/wiki/${title} +ki=http://ki.wikipedia.org/wiki/${title} +why=http://clublet.com/c/c/why?${title} +kg=http://kg.wikipedia.org/wiki/${title} +ka=http://ka.wikipedia.org/wiki/${title} +mus=http://mus.wikipedia.org/wiki/${title} +hrwiki=http://www.hrwiki.org/index.php/${title} +orgpatterns=http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?${title} +jv=http://jv.wikipedia.org/wiki/${title} +gotamac=http://www.got-a-mac.org/${title} +dolphinwiki=http://www.object-arts.com/wiki/html/Dolphin/${title} +zh-cn=http://zh.wikipedia.org/wiki/${title} 
+visualworks=http://wiki.cs.uiuc.edu/VisualWorks/${title} +iawiki=http://www.IAwiki.net/${title} +freebsdman=http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=${title} +ja=http://ja.wikipedia.org/wiki/${title} +chy=http://chy.wikipedia.org/wiki/${title} +unreal=http://wiki.beyondunreal.com/wiki/${title} +iu=http://iu.wikipedia.org/wiki/${title} +it=http://it.wikipedia.org/wiki/${title} +is=http://is.wikipedia.org/wiki/${title} +chr=http://chr.wikipedia.org/wiki/${title} +usemod=http://www.usemod.com/cgi-bin/wiki.pl?${title} +cmwiki=http://www.ourpla.net/cgi-bin/wiki.pl?${title} +hammondwiki=http://www.dairiki.org/HammondWiki/index.php3?${title} +cho=http://cho.wikipedia.org/wiki/${title} +io=http://io.wikipedia.org/wiki/${title} +personaltelco=http://www.personaltelco.net/index.cgi/${title} +ik=http://ik.wikipedia.org/wiki/${title} +haw=http://haw.wikipedia.org/wiki/${title} +ii=http://ii.wikipedia.org/wiki/${title} +wikisource=http://sources.wikipedia.org/wiki/${title} +lugkr=http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?${title} +ig=http://ig.wikipedia.org/wiki/${title} +zh-cfr=http://zh-min-nan.wikipedia.org/wiki/${title} +ie=http://ie.wikipedia.org/wiki/${title} +id=http://id.wikipedia.org/wiki/${title} +ia=http://ia.wikipedia.org/wiki/${title} +openwiki=http://openwiki.com/?${title} +hz=http://hz.wikipedia.org/wiki/${title} +hy=http://hy.wikipedia.org/wiki/${title} +strikiwiki=http://ch.twi.tudelft.nl/~mostert/striki/teststriki.pl?${title} +hu=http://hu.wikipedia.org/wiki/${title} +herzkinderwiki=http://www.herzkinderinfo.de/Mediawiki/index.php/${title} +ht=http://ht.wikipedia.org/wiki/${title} +hr=http://hr.wikipedia.org/wiki/${title} +webisodes=http://www.webisodes.org/${title} +globalvoices=http://cyber.law.harvard.edu/dyn/globalvoices/wiki/${title} +ho=http://ho.wikipedia.org/wiki/${title} +hi=http://hi.wikipedia.org/wiki/${title} +elibre=http://enciclopedia.us.es/index.php/${title} +alife=http://news.alife.org/wiki/index.php?${title} 
+he=http://he.wikipedia.org/wiki/${title} +ast=http://ast.wikipedia.org/wiki/${title} +ha=http://ha.wikipedia.org/wiki/${title} +revo=http://purl.org/NET/voko/revo/art/${title}.html +arxiv=http://www.arxiv.org/abs/${title} +sockwiki=http://wiki.socklabs.com/${title} +gv=http://gv.wikipedia.org/wiki/${title} +gu=http://gu.wikipedia.org/wiki/${title} +gn=http://gn.wikipedia.org/wiki/${title} +gl=http://gl.wikipedia.org/wiki/${title} +seapig=http://www.seapig.org/${title} +gd=http://gd.wikipedia.org/wiki/${title} +ga=http://ga.wikipedia.org/wiki/${title} +opera7wiki=http://nontroppo.org/wiki/${title} +oeis=http://www.research.att.com/cgi-bin/access.cgi/as/njas/sequences/eisA.cgi?Anum=${title} +moinmoin=http://purl.net/wiki/moin/${title} +fy=http://fy.wikipedia.org/wiki/${title} +gej=http://www.esperanto.de/cgi-bin/aktivikio/wiki.pl?${title} +fr=http://fr.wikipedia.org/wiki/${title} +arc=http://arc.wikipedia.org/wiki/${title} +fo=http://fo.wikipedia.org/wiki/${title} +fj=http://fj.wikipedia.org/wiki/${title} +wikinews=http://en.wikinews.org/wiki/${title} +fi=http://fi.wikipedia.org/wiki/${title} +ff=http://ff.wikipedia.org/wiki/${title} +annotationwiki=http://www.seedwiki.com/page.cfm?wikiid=368&doc=${title} +sep11=http://sep11.wikipedia.org/wiki/${title} +wlug=http://www.wlug.org.nz/${title} +fa=http://fa.wikipedia.org/wiki/${title} +eu=http://eu.wikipedia.org/wiki/${title} +tmbw=http://www.tmbw.net/wiki/index.php/${title} +et=http://et.wikipedia.org/wiki/${title} +scn=http://scn.wikipedia.org/wiki/${title} +es=http://es.wikipedia.org/wiki/${title} +muweb=http://www.dunstable.com/scripts/MuWebWeb?${title} +eo=http://eo.wikipedia.org/wiki/${title} +en=http://en.wikipedia.org/wiki/${title} +dejanews=http://www.deja.com/=dnc/getdoc.xp?AN=${title} +el=http://el.wikipedia.org/wiki/${title} +jargonfile=http://sunir.org/apps/meta.pl?wiki=JargonFile&redirect=${title} +eokulturcentro=http://esperanto.toulouse.free.fr/wakka.php?wiki=${title} 
+ee=http://ee.wikipedia.org/wiki/${title} +tum=http://tum.wikipedia.org/wiki/${title} +plog4u_de=http://plog4u.de/index.php/${title} +dz=http://dz.wikipedia.org/wiki/${title} +dv=http://dv.wikipedia.org/wiki/${title} +kerimwiki=http://wiki.oxus.net/${title} +dk=http://da.wikipedia.org/wiki/${title} +de=http://de.wikipedia.org/wiki/${title} +dwjwiki=http://www.suberic.net/cgi-bin/dwj/wiki.cgi?${title} +da=http://da.wikipedia.org/wiki/${title} +wlwiki=http://winslowslair.supremepixels.net/wiki/index.php/${title} +cy=http://cy.wikipedia.org/wiki/${title} +w=http://en.wikipedia.org/wiki/${title} +cv=http://cv.wikipedia.org/wiki/${title} +cs=http://cs.wikipedia.org/wiki/${title} +cr=http://cr.wikipedia.org/wiki/${title} +q=http://en.wikiquote.org/wiki/${title} +co=http://co.wikipedia.org/wiki/${title} +zh-min-nan=http://zh-min-nan.wikipedia.org/wiki/${title} +n=http://en.wikinews.org/wiki/${title} +m=http://meta.wikimedia.org/wiki/${title} +annotation=http://bayle.stanford.edu/crit/nph-med.cgi/${title} +ch=http://ch.wikipedia.org/wiki/${title} +efnetcppwiki=http://purl.net/wiki/cpp/${title} +ce=http://ce.wikipedia.org/wiki/${title} +c2find=http://c2.com/cgi/wiki?FindPage&value=${title} +b=http://en.wikibooks.org/wiki/${title} +ca=http://ca.wikipedia.org/wiki/${title} +dictionary=http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=${title} +ang=http://ang.wikipedia.org/wiki/${title} +zh-tw=http://zh.wikipedia.org/wiki/${title} +bs=http://bs.wikipedia.org/wiki/${title} +br=http://br.wikipedia.org/wiki/${title} +twiki=http://twiki.org/cgi-bin/view/${title} +bo=http://bo.wikipedia.org/wiki/${title} +wikt=http://en.wiktionary.org/wiki/${title} +bn=http://bn.wikipedia.org/wiki/${title} +bm=http://bm.wikipedia.org/wiki/${title} +bi=http://bi.wikipedia.org/wiki/${title} +bh=http://bh.wikipedia.org/wiki/${title} +bg=http://bg.wikipedia.org/wiki/${title} +knowhow=http://www2.iro.umontreal.ca/~paquetse/cgi-bin/wiki.cgi?${title} 
+be=http://be.wikipedia.org/wiki/${title} +wiki=http://c2.com/cgi/wiki?${title} +patwiki=http://gauss.ffii.org/${title} +ba=http://ba.wikipedia.org/wiki/${title} +rfc=http://www.rfc-editor.org/rfc/rfc${title}.txt +zu=http://zu.wikipedia.org/wiki/${title} +lanifexwiki=http://opt.lanifex.com/cgi-bin/wiki.pl?${title} +twistedwiki=http://purl.net/wiki/twisted/${title} +az=http://az.wikipedia.org/wiki/${title} +ay=http://ay.wikipedia.org/wiki/${title} +commons=http://commons.wikimedia.org/wiki/${title} +acronym=http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=${title} +av=http://av.wikipedia.org/wiki/${title} +aspienetwiki=http://aspie.mela.de/Wiki/index.php?title=${title} +as=http://as.wikipedia.org/wiki/${title} +metawiki=http://sunir.org/apps/meta.pl?${title} +ar=http://ar.wikipedia.org/wiki/${title} +zh=http://zh.wikipedia.org/wiki/${title} +pywiki=http://www.voght.com/cgi-bin/pywiki?${title} +an=http://an.wikipedia.org/wiki/${title} +am=http://am.wikipedia.org/wiki/${title} +ak=http://ak.wikipedia.org/wiki/${title} +infosecpedia=http://www.infosecpedia.org/pedia/index.php/${title} +za=http://za.wikipedia.org/wiki/${title} +af=http://af.wikipedia.org/wiki/${title} +firstwiki=http://firstwiki.org/index.php/${title} +als=http://als.wikipedia.org/wiki/${title} +ab=http://ab.wikipedia.org/wiki/${title} +aa=http://aa.wikipedia.org/wiki/${title} +ursine=http://ursine.ca/${title} +meatball=http://www.usemod.com/cgi-bin/mb.pl?${title} +mozillawiki=http://wiki.mozilla.org/index.php/${title} +imdb=http://us.imdb.com/Title?${title} +pythoninfo=http://www.python.org/cgi-bin/moinmoin/${title} +yo=http://yo.wikipedia.org/wiki/${title} +seattlewiki=http://seattlewiki.org/wiki/${title} +yi=http://yi.wikipedia.org/wiki/${title} +vls=http://vls.wikipedia.org/wiki/${title} +meta=http://meta.wikimedia.org/wiki/${title} +susning=http://www.susning.nu/${title} +nds=http://nds.wikipedia.org/wiki/${title} +wikitravel=http://wikitravel.org/en/${title} 
+codersbase=http://www.codersbase.com/${title} +tpi=http://tpi.wikipedia.org/wiki/${title} +ppr=http://c2.com/cgi/wiki?${title} \ No newline at end of file diff --git a/resources/operators.txt b/resources/operators.txt new file mode 100644 index 0000000..7d9835d --- /dev/null +++ b/resources/operators.txt @@ -0,0 +1,27 @@ +pre,-,PreMinus,4600 +pre,+,PrePlus,4600 +pre,not,Not,4600 +# +in,^,Pow,3700 +# +in,*,Times,3800 +in,/,Divide,3800 +in,div,Divide,3800 +in,mod,Mod,3800 +# +in,+,Plus,2900 +in,-,Subtract,2900 +# +in,round,Round,2800 +# +in,=,Equal,2600 +in,!=,Unequal,2600 +in,<>,Unequal,2600 +in,>,Greater,2600 +in,>=,GreaterEqual,2600 +in,<,Less,2600 +in,<=,LessEqual,2600 +# +in,and,And,2000 +# +in,or,Or,1900 diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala index 9847abd..299fe02 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala @@ -296,9 +296,18 @@ object ConllDocReader { var docCounter = 0; var fileIdx = 0; while (fileIdx < files.size && (size == -1 || docCounter < size)) { - val newDocs = reader.readConllDocs(files(fileIdx).getAbsolutePath); - docs ++= newDocs; - docCounter += newDocs.size + val pp = files(fileIdx).getAbsolutePath + try { + Logger.logss("Loading doc: " + pp) + val newDocs = reader.readConllDocs(pp); + docs ++= newDocs; + docCounter += newDocs.size + } catch { + case e : Exception => { + Logger.logss("failed document "+pp) + e.printStackTrace(System.err) + } + } fileIdx += 1; } val numDocs = if (size == -1) docs.size else Math.min(size, files.size); diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 8865ae4..72f5e05 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -15,43 +15,8 @@ import scala.xml._ */ class 
WikiDocReader (val lang : Language, val betterParsesFile : String = "") { - val betterParses = new HashMap[ArrayBuffer[String], Tree[String]] - - // TODO: betterParsesFile - - val headFinder = lang match { - case Language.ENGLISH => new ModCollinsHeadFinder() - case _ => throw new RuntimeException() - } - - val sentenceSplitter = SentenceSplitter.loadSentenceSplitter("models/sentsplit.txt.gz") - def readWikiDocs(fileName : String) : Seq[WikiDoc] = { - val referencesFile = fileName.replace("RawTexts", "Problems"); - val refxml = XML.loadFile(referencesFile); - val document = scala.io.Source.fromFile(fileName).mkString - - //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) - val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) - - - - for(reference <- refxml \ "ReferenceInstance") { - val surfaceForm = (reference \ "SurfaceForm")(0).text.trim - val offset = (reference \ "Offset")(0).text.trim.toInt - val length = (reference \ "Length")(0).text.trim.toInt - val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim - val annotatorId = (reference \ "AnnotatorId")(0).text.trim - val annotation = (reference \ "Annotation")(0).text.trim - - - } - // docID some unique identifier, filename - // partNo some int cnt - // words an array of sentences - // trees set of parse trees for a given sentence entity.DepConstTree - // nerchunks entity.Chunk Seq[WikiDoc]() diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index d872f7f..3b501c1 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -91,7 +91,7 @@ object WikiPreprocessor { parseBits(j) + "\t" + "\t-\t-\t-\t" + "-\t" + // speakers - "-\t" + // nerbit + "*\t" + // nerbit corefBits(j) + "\t" // coref bits ) } @@ -135,23 
+135,6 @@ object WikiPreprocessor { ret.toSeq } -// def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { -// var ret = new Array[String](cr.size) -// for(i <- 0 until cr.size) { -// var sb = new StringBuilder -// for(c <- cr) { -// -// if(c.start == i) { -// sb.append("(") -// sb.append(c.label) -// } -// if(c.end == i + 1) -// sb.append(")") -// -// } -// } -// ret -// } def mkWikiDoc(inputFile : String, docReader : WikiDocReader, @@ -159,48 +142,6 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) : WikiDoc = { - /*String docName = inputPath; - String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); - String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); - String[] sentences = null; - if (skipSentenceSplitting) { - sentences = canonicalizedParagraphs; - } else { - sentences = splitter.splitSentences(canonicalizedParagraphs); - } - String[][] tokenizedSentences = (useAlternateTokenizer ? 
splitter.tokenizeAlternate(sentences) : splitter.tokenize(sentences)); - Logger.logss("Document " + docName + " contains " + lines.length + " lines and " + tokenizedSentences.length + " sentences"); - String[][] docConllLines = renderDocConllLines(docName, tokenizedSentences, parser, backoffParser, nerSystem); - writeConllLines(docName, docConllLines, outputPath); -*/ - - /* - String[][] conllLines = new String[tokenizedSentences.length][]; - for (int sentIdx = 0; sentIdx < tokenizedSentences.length; sentIdx++) { - String[] tokenizedSentence = tokenizedSentences[sentIdx]; - Tree parse = parse(parser, backoffParser, Arrays.asList(tokenizedSentence)); - if (parse.getYield().size() != tokenizedSentence.length) { - Logger.logss("WARNING: couldn't parse sentence, dropping it: " + Arrays.toString(tokenizedSentence)); - Logger.logss(" (This will be fixed to backing off to an X-bar grammar in a future release)"); - } else { - String[] posTags = new String[tokenizedSentence.length]; - List preterminals = parse.getPreTerminalYield(); - for (int i = 0; i < preterminals.size(); i++) { - posTags[i] = preterminals.get(i); - } - String[] nerBioLabels = null; - if (nerSystem != null) { - nerBioLabels = nerSystem.tagBIO(tokenizedSentence, posTags); - } else { - nerBioLabels = new String[tokenizedSentence.length]; - Arrays.fill(nerBioLabels, "O"); - } - conllLines[sentIdx] = renderSentenceConllLines(docName, 0, tokenizedSentence, posTags, parse, nerBioLabels); - } - } - return conllLines; - - */ Logger.logss("starting processing of " + inputFile) val referencesFile = inputFile.replace("RawTexts", "Problems") diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 69fd469..00f4c26 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -26,7 +26,7 @@ import 
edu.berkeley.nlp.entity.wiki._ * java -cp /path/to/jar -Xmx8g edu.berkeley.nlp.entity.wiki.WikipediaInterface \ * -datasetPaths path/to/test-docs-directory-one-doc-per-file,path/to/additional/docs,... \ * -wikipediaDumpPath path/to/enwiki-latest-pages-articles.xml - * -outputDir path/to/output-file.ser.gz + * -outputPath path/to/output-file.ser.gz * * Required arguments: * -datasetPaths: pointer to CoNLL-formatted files whose mentions we should extract From 419e35c0e75ada31a5ea3e49b50d85a10348357b Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 3 Mar 2015 12:03:50 -0800 Subject: [PATCH 09/25] changes to simply seralize the wiki documents --- .../berkeley/nlp/entity/DepConstTree.scala | 3 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 1 + .../berkeley/nlp/entity/WikiDocReader.scala | 36 +++++-------------- .../entity/preprocess/WikiPreprocessor.scala | 13 ++++--- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala index 641cd4c..31a0d06 100644 --- a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala +++ b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala @@ -16,10 +16,11 @@ import java.util.Collections import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder import edu.berkeley.nlp.futile.ling.CollinsHeadFinder +@SerialVersionUID(1L) class DepConstTree(val constTree: Tree[String], val pos: Seq[String], val words: Seq[String], - val childParentDepMap: HashMap[Int,Int]) { + val childParentDepMap: HashMap[Int,Int]) extends Serializable { require(childParentDepMap.keys.toSeq.sorted.sameElements((0 until words.size)), PennTreeRenderer.render(constTree)); def size = words.size; diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala index 343703b..fc1ab62 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -3,6 +3,7 @@ package edu.berkeley.nlp.entity /** * Created by matthew on 2/18/15. */ +@SerialVersionUID(1L) case class WikiDoc (docID : String, docPartNo : Int, words : Seq[Seq[String]], diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 72f5e05..2c2f6d8 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -2,41 +2,23 @@ package edu.berkeley.nlp.entity import java.io.File -import edu.berkeley.nlp.entity.lang.{ModCollinsHeadFinder, Language} -import edu.berkeley.nlp.entity.preprocess.SentenceSplitter -import edu.berkeley.nlp.futile.syntax.Tree +import edu.berkeley.nlp.entity.lang.Language -import scala.collection.immutable.HashMap import scala.collection.mutable.ArrayBuffer -import scala.xml._ /** * Created by matthew on 2/18/15. */ -class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { - - def readWikiDocs(fileName : String) : Seq[WikiDoc] = { - - - - Seq[WikiDoc]() - } - -} +class WikiDocReader (lang : Language, better : String) {} // TODO: remove object WikiDocReader { def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = { - val rawDir = new File(path) - if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { - throw new RuntimeException("Couldn't find directory " + path); - } - var rawFiles = rawDir.listFiles.map(_.getAbsolutePath()) - //val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix)); - val reader = new WikiDocReader(lang, betterParsesFile) - val docs = new ArrayBuffer[Document] - for(fname <- rawFiles) { - docs ++= reader.readWikiDocs(fname) - } - docs + + var docs = GUtil.load(path).asInstanceOf[List[WikiDoc]] + + if(size != -1 && docs.size > 
size) + docs.map(_.asInstanceOf[Document]).slice(0, size).toSeq + else + docs.map(_.asInstanceOf[Document]).toSeq } } \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 3b501c1..362d4e1 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -4,7 +4,7 @@ import java.io.File import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder -import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.futile.syntax.Tree @@ -31,7 +31,7 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - new File(inputDir).listFiles.map(file => { + val wikiDocs = new File(inputDir).listFiles.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName Future { @@ -46,7 +46,11 @@ object WikiPreprocessor { } } } - }).foreach(Await.result(_, duration.Duration.Inf)) + }).map(f => { + Await.result(f, duration.Duration.Inf) + f.value.get.get + }).filter(_ != null).toList + GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz") } def process(inputFile : String, outputFile : String, @@ -54,13 +58,14 @@ object WikiPreprocessor { splitter : SentenceSplitter, parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, - nerSystem : NerSystemLabeled) = { + nerSystem : NerSystemLabeled) : WikiDoc = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) val lines = wikiToConllLines(wdoc) val wlines = wikiToWikiLines(wdoc) 
//PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) writeWikiLines(wdoc.docID, lines, outputFile) writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki")) + wdoc } def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = { From e5b69f1644530ae337f0993f09d3be7c5f8014d1 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 7 Mar 2015 20:40:33 -0800 Subject: [PATCH 10/25] hopefully fix some bugs --- .../entity/preprocess/WikiPreprocessor.scala | 92 +++++++++++-------- .../wiki/JointQueryDenotationChooser.scala | 24 +++-- .../nlp/entity/wiki/WikipediaInterface.scala | 33 +++++-- .../nlp/entity/wiki/WikipediaLinkDB.scala | 2 +- 4 files changed, 98 insertions(+), 53 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 362d4e1..a63f48b 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -31,24 +31,19 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - val wikiDocs = new File(inputDir).listFiles.map(file => { + val wikiDocs = new File(inputDir).listFiles/*.par*/.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName - Future { - try { - process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) - } catch { - case e : Exception => { - Logger.logss("failed file: "+input_file) - System.err.print(e.toString) - e.printStackTrace(System.err) - null - } + try { + process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) + } catch { + case e : Exception => { + Logger.logss("failed file: "+input_file) + 
System.err.print(e.toString) + e.printStackTrace(System.err) + null } } - }).map(f => { - Await.result(f, duration.Duration.Inf) - f.value.get.get }).filter(_ != null).toList GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz") } @@ -61,6 +56,7 @@ object WikiPreprocessor { nerSystem : NerSystemLabeled) : WikiDoc = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) val lines = wikiToConllLines(wdoc) + //val wlines = wiki.WikiAnnotReaderWriter.getWikiBits(wdoc.words.map(_.size), wdoc.wikiRefChunks) val wlines = wikiToWikiLines(wdoc) //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) writeWikiLines(wdoc.docID, lines, outputFile) @@ -118,7 +114,8 @@ object WikiPreprocessor { ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) } - def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + /*def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + // this does not handle multiple chunks on the same span well, but that shouldn't be an issue, since wiki docs shouldn't have that val ret = ListBuffer[Seq[String]]() for(i <- 0 until wdoc.numSents) { val lines = new ListBuffer[String]() @@ -138,6 +135,21 @@ object WikiPreprocessor { ret.append(lines.toSeq) } ret.toSeq + }*/ + + def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + for (sentIdx <- 0 until wdoc.words.size) yield { + for (tokenIdx <- 0 until wdoc.words(sentIdx).size) yield { + val chunksStartingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.start == tokenIdx).sortBy(- _.end); + val numChunksEndingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.end - 1 == tokenIdx).size; + var str = if(chunksStartingHere.isEmpty) "" else { + chunksStartingHere.map("("+_.label.replace("(", "-LRB-").replace(")", "-RRB-").replace("*", "-STAR-")).reduce(_+"|"+_) + } + str += "*"; + str += ")" * numChunksEndingHere + str; + } + } } @@ -174,6 +186,10 @@ object WikiPreprocessor { val d = 
doclenratio * (ref._2 + ref._3 / 2.0) var cnt = 0 val wrds = ref._1.replace(" ", "") + + if(wrds.isEmpty) // wtf, how does not create an empty citation??? + return (-1, null) + def rank_match(i : Int, j : Int) : Double = { val res = tokens(i).drop(j).reduce(_+_) for(q <- 0 until Math.min(wrds.size, res.size)) { @@ -182,33 +198,31 @@ object WikiPreprocessor { } 1.0 } - for(i <- 0 to sentences.size) { - cnt += sentences(i).size - if(cnt > d) { - // assume that the reference is in this sentence - var ll = cnt - sentences(i).size + d // estimated place in sentence - var tcnt = 0 - var best_start = 0 - var best_rank = Double.NegativeInfinity - - for(j <- 0 until tokens(i).size) { - val r = rank_match(i,j) / Math.abs(ll - tcnt) // try and make the item close to where it should be - if(r > best_rank) { - best_start = j - best_rank = r - } - tcnt += tokens(i)(j).size + var best_start = 0 + var best_rank = Double.NegativeInfinity + var best_sentence = 0 + for(i <- 0 until sentences.size) { + var tcnt = 0 + for(j <- 0 until tokens(i).size) { + val r = rank_match(i, j) / Math.log(Math.abs(d - cnt - tcnt) + 2) // little to simple, but works in most cases + if(r > best_rank) { + best_rank = r + best_start = j + best_sentence = i } - var len = 0 - var len_cnt = 0 - for(j <- best_start until tokens(i).size; if len_cnt < wrds.size) { - len_cnt += tokens(i)(j).size - len += 1 - } - return (i, new Chunk(best_start, best_start + len, ref._4)) + tcnt += tokens(i)(j).size + 1 // +1 to match the space } + cnt += sentences(i).size + } + var len = 0 + var len_cnt = 0 + for(j <- best_start until tokens(best_sentence).size; if len_cnt < wrds.size) { + len_cnt += tokens(best_sentence)(j).size + len += 1 } - (-1, null) + if(len == 0) + return (-1, null) + (best_sentence, new Chunk(best_start, best_start + len, ref._4)) } val refplaces = references.map(refFinder) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala 
b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 4d03771..9da4d77 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -3,16 +3,14 @@ package edu.berkeley.nlp.entity.wiki import edu.berkeley.nlp.entity.lang.Language import edu.berkeley.nlp.futile.LightRunner import edu.berkeley.nlp.entity.coref.CorefDocAssembler -import edu.berkeley.nlp.entity.ConllDocReader +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.entity.coref.MentionPropertyComputer -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.fig.basic.Indexer import edu.berkeley.nlp.entity.joint.LikelihoodAndGradientComputer import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.coref.CorefDoc import edu.berkeley.nlp.futile.math.SloppyMath import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.Chunk import edu.berkeley.nlp.entity.joint.GeneralTrainer /** @@ -185,7 +183,7 @@ object JointQueryDenotationChooser { val trainDataPath = "data/ace05/train"; val testDataPath = "data/ace05/dev"; - val wikiPath = "data/ace05/ace05-all-conll-wiki" + val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both items val wikiDBPath = "models/wiki-db-ace.ser.gz" val lambda = 1e-8F @@ -199,8 +197,22 @@ object JointQueryDenotationChooser { LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args) // Read in CoNLL documents val assembler = CorefDocAssembler(Language.ENGLISH, true); - val trainDocs = ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH); - val trainCorefDocs = trainDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + val trainDocs = if(trainDataPath.startsWith("wikiser:")) { + WikiDocReader.loadRawWikiDocs(trainDataPath.split(":")(1), -1, "", Language.ENGLISH) + } else { + 
ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH) + }; + val trainCorefDocs = trainDocs.map(doc => { + try { + assembler.createCorefDoc(doc, new MentionPropertyComputer(None)) + } catch { + case e : Exception => { + // TODO: fix the wikidocument parser + println("failed document "+doc.docID) + null + } + } + }).filter(_!=null); // Read in gold Wikification labels val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 00f4c26..ebeb0d3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -210,22 +210,41 @@ object WikipediaInterface { val mentionPropertyComputer = new MentionPropertyComputer(None); val pmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = false); val gmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = true); - val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path => { - if (WikipediaInterface.mentionType == "old") { + val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path_ => { + var path = path_ + val mentionType = if(path.contains(":")) { + val s = path.split(":") + path = s(1) + s(0) + } else { + WikipediaInterface.mentionType + } + Logger.logss("Loading documents "+mentionType+" "+path) + if (mentionType == "old") { // Wikification dataset: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "ace") { + } else if (mentionType == "ace") { // ACE: Use gold mentions here ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => gmAssembler.createCorefDoc(doc, 
mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "ontonotes") { + } else if (mentionType == "ontonotes") { // OntoNotes: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "wikipedia") { - WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)) + } else if (mentionType == "wikiser") { + WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => { + try { + gmAssembler.createCorefDoc(doc, mentionPropertyComputer) + } catch { + case e : Exception => { + // there are currently about 30 documents that are having an issue with their references + println("FAIL DOCUMENT: "+doc.docID) + null + } + } + }) } else { throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType); } - }); + }).filter(_!=null); // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. 
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index cdcb894..f2f1f6a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -124,7 +124,7 @@ object WikipediaLinkDB { } } } - val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); + val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); // TODO: WTF: inlinksmap is never written to val outLinksMapArrs = outLinksMap.map(entry => entry._1 -> entry._2.toArray); val sizes = Array.tabulate(10)(i => 0); for (key <- outLinksMapArrs.keySet) { From 5535e73811cdf525ebc1bb8c97850e9681b371ae Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 12 Mar 2015 19:33:28 -0700 Subject: [PATCH 11/25] adding some comments --- .../java/edu/berkeley/nlp/entity/GUtil.scala | 2 +- .../entity/preprocess/WikiPreprocessor.scala | 2 +- .../wiki/JointQueryDenotationChooser.scala | 68 ++++++++++++++----- .../entity/wiki/WikificationEvaluator.scala | 27 ++++++++ 4 files changed, 81 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala index 803cd6d..8031560 100644 --- a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala +++ b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala @@ -406,7 +406,7 @@ object GUtil { def argMaxIdxFloat(values: Seq[Float]) = { var currIdx = 0; var maxIdx = 0; - var maxVal = Double.NegativeInfinity; + var maxVal = Float.NegativeInfinity; while (currIdx < values.size) { if (values(currIdx) > maxVal) { maxIdx = currIdx; diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index a63f48b..21af271 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -31,7 +31,7 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - val wikiDocs = new File(inputDir).listFiles/*.par*/.map(file => { + val wikiDocs = new File(inputDir).listFiles.par.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName try { diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 9da4d77..23c025a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -53,13 +53,17 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, */ def getUnnormalizedJointScores(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Array[Float]] = { featurizeUseCache(ex, false) + // each example will have a number of features associated with each query + // each feature is an indicator, so we use the cache of the features indexes + // and sum the values of the features val rawQueryScores = ex.cachedFeatsEachQuery.map(feats => GUtil.scoreIndexedFeats(feats, weights)); + // these are the weights from each query wrt the various word choices val queryDenotationMatrix = ex.cachedFeatsEachQueryDenotation.map(_.map(feats => GUtil.scoreIndexedFeats(feats, weights))); val scores = Array.tabulate(ex.queries.size, ex.allDenotations.size)((i, j) => Float.NegativeInfinity) - for (queryIdx <- 0 until ex.queries.size) { - for (denotationIdx <- 0 until ex.allDenotations.size) { - scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx) - } + for (queryIdx <- 0 until ex.queries.size; denotationIdx <- 0 until ex.allDenotations.size) { + // These are indicator weights, so by 
summing them we can compute the resulting value of choosing a given word + // and a given query by combining the results of the dot product of the query and the denotation + scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx) } scores } @@ -70,7 +74,9 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, */ def getDenotationLogMarginals(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Float] = { val scores = getUnnormalizedJointScores(ex, weights) - // Sum up each column + // the scores matrix contains log(p_{i,j}), so we are using + // logAdd to sum the probabilities + // as p(q,d) \propto e^(w^T f(q,d)) val rawDenotationMarginals = Array.tabulate(ex.allDenotations.size)(i => SloppyMath.logAdd(scores.map(_(i))).toFloat) val normalizer = SloppyMath.logAdd(rawDenotationMarginals).toFloat (0 until rawDenotationMarginals.size).foreach(i => rawDenotationMarginals(i) -= normalizer) @@ -136,6 +142,15 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); computer.computeDenotation(ex, weights) } + + def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { + val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); + val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); + val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); + val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) + + ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).map(_._1) + } } object JointQueryDenotationChooser { @@ -180,6 +195,16 @@ object JointQueryDenotationChooser { exs; } + + def loadDocuments(path : String) = { + val limit = 500 // -1 + if(path.startsWith("wikiser:")) { + WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, 
"", Language.ENGLISH) + } else { + ConllDocReader.loadRawConllDocsWithSuffix(path, limit, "", Language.ENGLISH) + } + } + val trainDataPath = "data/ace05/train"; val testDataPath = "data/ace05/dev"; @@ -197,11 +222,7 @@ object JointQueryDenotationChooser { LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args) // Read in CoNLL documents val assembler = CorefDocAssembler(Language.ENGLISH, true); - val trainDocs = if(trainDataPath.startsWith("wikiser:")) { - WikiDocReader.loadRawWikiDocs(trainDataPath.split(":")(1), -1, "", Language.ENGLISH) - } else { - ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH) - }; + val trainDocs = loadDocuments(trainDataPath); val trainCorefDocs = trainDocs.map(doc => { try { assembler.createCorefDoc(doc, new MentionPropertyComputer(None)) @@ -213,7 +234,11 @@ object JointQueryDenotationChooser { } } }).filter(_!=null); - + + //val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH); + val testDocs = loadDocuments(testDataPath) + val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + // Read in gold Wikification labels val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath) // Read in the title given surface database @@ -237,16 +262,27 @@ object JointQueryDenotationChooser { // Build the test examples and decode the test set // No filtering now because we're doing test - val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH); - val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); - val goldTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[Seq[String]](i, i+1, testExs(i).rawCorrectDenotations)) - val predTestDenotationsAsTrivialChunks = (0 until 
testExs.size).map(i => new Chunk[String](i, i+1, chooser.pickDenotation(testExs(i).queries, wikiDB))) + + val results = testExs.map(t => { + // TOD: need more then one perdicted title + (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB)) + }) + + val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) + val predTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[String](i, i+1, results(i)._2(0))) // Hacky but lets us reuse some code that normally evaluates things with variable endpoints // WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks)) WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set()) - + //val outs = new PrintWRiter(System.out) + //WikificationEvaluator.writeWikificationRightAndWrong(outs, outs, ) + + + + + LightRunner.finalizeOutput(); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index cdb1566..9cbf642 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -78,6 +78,33 @@ object WikificationEvaluator { } Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); } + + + // create sets of all the gold document references, and all the documents + // that we generate, and then compute an F1 + + /*def evaluateBOTF1_mfl(results : Seq[(String, Seq[String])]) = { + var correct = 0; + var precDenom = 0; + var recDenom = 0; + for (i <- 0 until results.size) { + + for (title <- allPredTitles(i)) { + var markedCorrect = false; + for (goldTitleSet <- allGoldTitles(i)) { + markedCorrect = markedCorrect || isCorrect(goldTitleSet.toSeq, title); + } + if (markedCorrect) 
{ + correct += 1; + } + } + precDenom += allPredTitles(i).size; + recDenom += allGoldTitles(i).size; + } + Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); + }*/ + + def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = { val bagOfTitles = titles.flatMap(sentTitles => { From b28c1b8498429de1d822f056020fdab82cc3cd6a Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Fri, 13 Mar 2015 12:04:04 -0700 Subject: [PATCH 12/25] buggy f1 scorer --- .../wiki/JointQueryDenotationChooser.scala | 11 +++-- .../entity/wiki/WikificationEvaluator.scala | 42 +++++++++---------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 23c025a..c3c5623 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -149,7 +149,7 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) - ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).map(_._1) + ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) } } @@ -266,8 +266,8 @@ object JointQueryDenotationChooser { val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); val results = testExs.map(t => { - // TOD: need more then one perdicted title - (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB)) + // TODO: need more then one perdicted title + (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB), t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 
until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) @@ -276,11 +276,10 @@ object JointQueryDenotationChooser { // Hacky but lets us reuse some code that normally evaluates things with variable endpoints // WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks)) WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set()) - //val outs = new PrintWRiter(System.out) - //WikificationEvaluator.writeWikificationRightAndWrong(outs, outs, ) - + val mentionsByDoc = results.groupBy(_._3) + WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc) LightRunner.finalizeOutput(); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index 9cbf642..ecd411c 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -1,13 +1,14 @@ package edu.berkeley.nlp.entity.wiki -import edu.berkeley.nlp.entity.Chunk +import edu.berkeley.nlp.entity.{Document, Chunk, GUtil} import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.util.Counter import scala.collection.JavaConverters._ import edu.berkeley.nlp.entity.joint.JointDocACE import java.io.PrintWriter +import scala.collection.mutable.ArrayBuffer + object WikificationEvaluator { def removeExcludes(chunks: Seq[Chunk[String]]) = chunks.filter(chunk => chunk.label != ExcludeToken) @@ -82,28 +83,27 @@ object WikificationEvaluator { // create sets of all the gold document references, and all the documents // that we generate, and then compute an F1 - - /*def evaluateBOTF1_mfl(results : Seq[(String, Seq[String])]) = { - var correct = 0; - var precDenom = 0; - var recDenom = 0; - for (i <- 0 until results.size) { - - for 
(title <- allPredTitles(i)) { - var markedCorrect = false; - for (goldTitleSet <- allGoldTitles(i)) { - markedCorrect = markedCorrect || isCorrect(goldTitleSet.toSeq, title); - } - if (markedCorrect) { - correct += 1; + def evaluateBOTF1_mfl(results : Map[Document, Seq[(Seq[String], Seq[String], Document)]]) = { + // f1 = 2 * precision * recall / (percison + recall) + var correct = 0 + var precDenom = 0 + var recDenom = 0 + for((doc, matches) <- results) { + var seenBefore = Set[String]() + for((gold, selected, _) <- matches) { + val goldS = Set(gold:_*) + val selectedS = Set(selected(0)) //Set(selected:_*) + val ints = goldS & selectedS + if(!ints.subsetOf(seenBefore)) { + correct += ints.size + seenBefore ++= ints } } - precDenom += allPredTitles(i).size; - recDenom += allGoldTitles(i).size; + precDenom += Set(matches.flatMap(_._2):_*).size + recDenom += Set(matches.flatMap(_._1):_*).size } - Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); - }*/ - + Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)) + } def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = { From 8b4bf0128b7c8651b975a8fb55688c84e4113e9b Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 14 Mar 2015 01:00:38 -0700 Subject: [PATCH 13/25] fix f1 metric --- .../entity/wiki/WikificationEvaluator.scala | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index ecd411c..c4ad61f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -90,17 +90,33 @@ object WikificationEvaluator { var recDenom = 0 for((doc, matches) <- results) { var seenBefore = Set[String]() - for((gold, selected, _) <- matches) { + val 
allGold = Set(matches.flatMap(_._1):_*) + val allChoosen = Set(matches.map(_._2(0)):_*) //Set(matches.flatMap(_._2):_*) + + /*for((gold, selected, _) <- matches) { val goldS = Set(gold:_*) val selectedS = Set(selected(0)) //Set(selected:_*) val ints = goldS & selectedS - if(!ints.subsetOf(seenBefore)) { + //if(!ints.subsetOf(seenBefore)) { correct += ints.size seenBefore ++= ints - } - } - precDenom += Set(matches.flatMap(_._2):_*).size - recDenom += Set(matches.flatMap(_._1):_*).size + //} + }*/ + // TODO: something wrong with computing the set intersection + + val dprecDenom = allChoosen.size + val drecDenom = allGold.size + var dcorrect = 0 + allChoosen.foreach(c => { + if(isCorrect(allGold.toSeq, c)) + dcorrect += 1 + }) + //val diff = (allGold ++ allChoosen) -- (allGold & allChoosen) + //val dcorrect = (allGold & allChoosen).size + //Logger.logss("Document f1: "+GUtil.renderPRF1(dcorrect, dprecDenom, drecDenom)) + precDenom += dprecDenom + recDenom += drecDenom + correct += dcorrect } Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)) } From bb12bd1d6fe4ed8e400c9da43118489c800c910c Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 17 Mar 2015 16:23:25 -0700 Subject: [PATCH 14/25] some bug fixes --- .../wiki/JointQueryDenotationChooser.scala | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index c3c5623..2a30f00 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -40,7 +40,7 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, val featureIndexer: Indexer[String]) extends LikelihoodAndGradientComputer[JointQueryDenotationExample] { // Used for feature computation val queryChooser 
= new QueryChoiceComputer(wikiDB, featureIndexer) - + def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) @@ -136,21 +136,26 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val weights: Array[Float]) extends Serializable { - def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = { + /*def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); - val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); computer.computeDenotation(ex, weights) - } + }*/ def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); - val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); - val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val dden = Query.extractDenotationSetWithNil(queries, denotations, 10) + val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) } + + def diffFeatures(correct: Query, choosen: Query, wikiDB: WikipediaInterface) = { + + } } object JointQueryDenotationChooser { @@ -197,7 +202,7 @@ object JointQueryDenotationChooser { def 
loadDocuments(path : String) = { - val limit = 500 // -1 + val limit = numLoadedSamples//500 if(path.startsWith("wikiser:")) { WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, "", Language.ENGLISH) } else { @@ -216,6 +221,8 @@ object JointQueryDenotationChooser { val numItrs = 20 val maxNumWikificationOptions = 7 + + val numLoadedSamples = -1 // for debugging by loading less samples def main(args: Array[String]) { LightRunner.initializeOutput(JointQueryDenotationChooser.getClass()); @@ -263,11 +270,37 @@ object JointQueryDenotationChooser { // Build the test examples and decode the test set // No filtering now because we're doing test - val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); + val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = true)//false); + + var correctItemWasInSet = 0 val results = testExs.map(t => { // TODO: need more then one perdicted title - (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB), t.queries(0).originalMent.rawDoc) + val picks = chooser.pickDenotations(t.queries, wikiDB) + if(!isCorrect(t.rawCorrectDenotations, picks(0))) { + // the pick is not correct, attempt to determine if there would have + // been a better pick that is in the picks list (which basically means all of the + /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) { + // the correct pick was the second answer instead of the first one + // try and report the differences between the two items + println("second pick was correct") + + }*/ + var qq = false + for((p, i) <- picks.drop(1).zipWithIndex) { + // try: t.correctDenotations here? 
+ if(isCorrect(t.correctDenotations, p) || isCorrect(t.rawCorrectDenotations, p)) { + println("Found correct item with "+i) + correctItemWasInSet += 1 + qq = true + //println("found correct item") + } + } + if(!qq) { + println("???") + } + } + (t.rawCorrectDenotations, picks, t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) @@ -280,6 +313,7 @@ object JointQueryDenotationChooser { val mentionsByDoc = results.groupBy(_._3) WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc) + println("Number of correct items that were in the set: "+correctItemWasInSet) LightRunner.finalizeOutput(); From 8c77dfbd8f66f1f8f0993a61178e6f4ea620039f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Wed, 25 Mar 2015 17:01:07 -0700 Subject: [PATCH 15/25] make the gold follow the redirect db as they currently reference old pages --- .../java/edu/berkeley/nlp/entity/coref/Mention.scala | 11 +++++++++++ .../coref/PairwiseIndexingFeaturizerJoint.scala | 2 ++ .../nlp/entity/wiki/JointQueryDenotationChooser.scala | 9 +++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala index 8069292..c31144a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala @@ -15,6 +15,7 @@ import edu.berkeley.nlp.entity.WordNetInterfacer // TODO: Extract an interface for ConllDoc so I don't have to keep the whole // document around...but while I'm feature engineering it's useful to be able // to put my hands on anything I want +// ... 
ok settle down class Mention(val rawDoc: Document, val mentIdx: Int, val sentIdx: Int, @@ -39,6 +40,16 @@ class Mention(val rawDoc: Document, var cachedNerPossibilities: Option[Chunk[Counter[String]]] = None; var cachedNerGold: Option[Chunk[String]] = None; + override def toString = { + var ret = "{" + if(startIdx > 1) + ret += rawDoc.words(sentIdx)(startIdx - 1) + " " + ret += "["+spanToString+"]" + if(endIdx < rawDoc.words(sentIdx).size-1) + ret += rawDoc.words(sentIdx)(endIdx+ 1) + ret + "}" + } + def speaker = rawDoc.speakers(sentIdx)(headIdx); def headString = rawDoc.words(sentIdx)(headIdx); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala index 31c32f6..21b1ac7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala @@ -21,6 +21,8 @@ import edu.berkeley.nlp.entity.WordNetInterfacer * DO NOT try to add WordNetInterfacer here! It is not serializable and so * everything will explode when we try to serialize the model. So we choose * to cache it on the documents even though this is pretty hacky. 
+ * + * TODO: maybe change to using "transient" fields re:^^ */ @SerialVersionUID(1L) class PairwiseIndexingFeaturizerJoint(val featureIndexer: Indexer[String], diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 2a30f00..0717bd7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -146,7 +146,7 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); - val dden = Query.extractDenotationSetWithNil(queries, denotations, 10) + val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) @@ -177,7 +177,8 @@ object JointQueryDenotationChooser { // There are multiple possible gold Wikipedia titles for some mentions. 
Note that // NIL (no entry in Wikipedia) is included as an explicit choice, so this includes NILs (as // it should according to how the task is defined) - val goldLabel = getGoldWikification(goldWikification(docName), ment) + val goldLabelp = getGoldWikification(goldWikification(docName), ment) + val goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct if (goldLabel.size >= 1) { val queries = Query.extractQueriesBest(ment, true); val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); @@ -189,6 +190,10 @@ object JointQueryDenotationChooser { // if (correctIndices.isEmpty && if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; + println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) + if(goldLabel.contains("Lord_Speaker")) { + println("wtfwtf") + } } else { exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel) } From 1ff173dde61eab366482099b1d529238c44cdaa7 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 26 Mar 2015 23:55:03 -0700 Subject: [PATCH 16/25] attempt at adding more queries to find the matching page title --- .../wiki/JointQueryDenotationChooser.scala | 6 +- .../edu/berkeley/nlp/entity/wiki/Query.scala | 58 +++++++++++++++++-- .../nlp/entity/wiki/QueryChooser.scala | 2 + .../nlp/entity/wiki/WikipediaInterface.scala | 20 +++++-- 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 0717bd7..43648c5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -180,7 +180,11 @@ object JointQueryDenotationChooser { val goldLabelp = getGoldWikification(goldWikification(docName), ment) val 
goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct if (goldLabel.size >= 1) { + //val oldqueries = Query.extractQueriesBest_old(ment, true); val queries = Query.extractQueriesBest(ment, true); + /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) { + println("failed") + }*/ val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); // val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_)); val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions); @@ -225,7 +229,7 @@ object JointQueryDenotationChooser { val batchSize = 1 val numItrs = 20 - val maxNumWikificationOptions = 7 + val maxNumWikificationOptions = 20 //7 val numLoadedSamples = -1 // for debugging by loading less samples diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala index ce86957..e7f6c56 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala @@ -16,7 +16,8 @@ case class Query(val words: Seq[String], val originalMent: Mention, val finalSpan: (Int, Int), val queryType: String, - val removePuncFromQuery: Boolean = true) { + val removePuncFromQuery: Boolean = true, + val features: List[String] = List[String]()) { def getFinalQueryStr = { val wordsNoPunc = if (removePuncFromQuery) { @@ -40,9 +41,16 @@ object Query { val PluralQueryExpand = true; val RemovePuncFromQuery = true; val UseFirstHead = true; - val MaxQueryLen = 4; - val BlackList = Set("the", "a", "my", "your", "his", "her", "our", "their", "its", "this", "that", "these", "those") - val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' '); + val MaxQueryLen = 8; + val BlackList = Set( + "the", "a", "my", "your", "his", "her", "our", + "their", "its", "this", "that", "these", "those", + "of" + ) + val 
PuncList = Set( + ',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', + '[', ']', '{', '}', ' ' + ) /** * Check if a token is "blacklisted", meaning that we shouldn't form a query that starts with @@ -73,7 +81,7 @@ object Query { * considering different subsets of the words in the mention and munging capitalization and * stemming, since lowercasing and dropping a plural-marking "s" are useful for nominals. */ - def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { + def extractQueriesBest_old(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { val queries = new ArrayBuffer[Query]; val mentWords = ment.words; // Try the whole query, then prefixes ending in the head @@ -107,6 +115,46 @@ object Query { // } queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()); } + + def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { + val queries = new ArrayBuffer[Query]() + val mentWords = ment.words + val relHeadIdx = ment.contextTree.getSpanHeadACECustom(ment.startIdx, ment.endIdx) - ment.startIdx + def addQuery(start: Int, end: Int, featsi:List[String]): Unit = { + var feats = featsi // gaaaaa + val thisSlice = new ArrayBuffer[Query]() + val wrds = mentWords.slice(start, end) + thisSlice += new Query(wrds, ment, (start, end), "STD", RemovePuncFromQuery, feats) + val firstWord = wrds(0) + val lastWord = wrds(wrds.size - 1) + if((end - start)== 1) + feats ++= List("SingleItemQuery") + if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", RemovePuncFromQuery, feats); + } + // Stemming (but only on head alone) + if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') { + thisSlice ++= thisSlice.map(qu => + new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", 
RemovePuncFromQuery, feats)); + } + queries ++= thisSlice + } + addQuery(0, ment.endIdx - ment.startIdx, List("SimpleQuery", "FullTextQuery")) + // TODO: make this ignore items that simply add a blacklisted word + for(i <- 0 to relHeadIdx) { + addQuery(i, relHeadIdx + 1, List("SimpleQuery", "PreHeadQuery")) + } + for(i <- relHeadIdx+1 until mentWords.size) { + addQuery(relHeadIdx, i, List("SimpleQuery", "PostHeadQuery")) + } + // try filtering words + val filterWords = mentWords.filter(!isBlacklisted(_, 0)) + if(filterWords.size != mentWords.size) { + // we lost something, make new query + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", RemovePuncFromQuery, List("FilteredQuery")) + } + queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()) + } def extractDenotationSetWithNil(queries: Seq[Query], queryDisambigs: Seq[Counter[String]], maxDenotations: Int): Seq[String] = { val choicesEachQuery = queryDisambigs.map(_.getSortedKeys().asScala); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index e3b4d32..90538cb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -123,6 +123,8 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val longQuery = tagsWithin.size > 3; feat("DescriptorQueryTags=" + queryDescriptor + "-" + contextTag + (if (longQuery) "...") + tagsWithin.slice(Math.max(0, tagsWithin.size - 3), tagsWithin.size).toString); feat("DescriptorHead=" + queryDescriptor + "-" + binSize(querySize) + "-" + ment.headStringLc); + for(f <- query.features) + feat(f) feats.toArray; }); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index ebeb0d3..88bcdd3 100644 --- 
a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -94,19 +94,31 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, } def disambiguateBestGetAllOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def disambiguateBestGetAllOptions(query: Query) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( + Seq(query.getFinalQueryStr)))); } def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def disambiguateBestGetAllOneBestOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def 
getCategories(title: String) = categoryDB.getCategories(title); From 88d05a29bb56e0b6f811762748614a043cece83f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Mar 2015 14:27:07 -0700 Subject: [PATCH 17/25] some changes to trying to generate queries --- .../nlp/entity/wiki/JointQueryDenotationChooser.scala | 5 +++-- .../edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala | 6 +++++- .../nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 43648c5..e248cbb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -183,10 +183,11 @@ object JointQueryDenotationChooser { //val oldqueries = Query.extractQueriesBest_old(ment, true); val queries = Query.extractQueriesBest(ment, true); /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) { - println("failed") + println("failed...") }*/ - val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); + //val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); // val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_)); + val queryDisambigs = queries.map(wikiDB.disambigRes(_)) val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions); val correctDenotations = denotations.filter(denotation => isCorrect(goldLabel, denotation)) // N.B. 
The use of "isCorrect" here is needed to canonicalize diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 88bcdd3..a6ab5b6 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -106,7 +106,11 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( Seq(query.getFinalQueryStr)))); } - + + def disambigRes(query: Query) = { + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)) + } + def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { auxDB.purgeDisambiguationAll( redirectsDB.followRedirectsCounter( diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala index 2445259..d41605a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala @@ -12,6 +12,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap // Need to know all titles (including redirects) +// determins what surfaces values link to with a given count @SerialVersionUID(1L) class WikipediaTitleGivenSurfaceDB(val surfaceToTitle: CounterMap[String,String]) extends Serializable { val truecaseMap = new HashMap[String,ArrayBuffer[String]]; From b8a9e17707475af2f15cfb59a3c7d34dddb1f778 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Wed, 1 Apr 2015 10:31:28 -0700 Subject: [PATCH 18/25] lot more queries being generated, but about 1/3 as many impossible queries --- build.sbt | 9 +- .../wiki/JointQueryDenotationChooser.scala | 4 +- .../edu/berkeley/nlp/entity/wiki/Query.scala | 15 ++- 
.../nlp/entity/wiki/WikipediaInterface.scala | 26 +++- .../entity/wiki/WikipediaInterface_db.scala | 127 ++++++++++++++++++ .../wiki/WikipediaTitleGivenSurfaceDB.scala | 8 +- 6 files changed, 176 insertions(+), 13 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala diff --git a/build.sbt b/build.sbt index a3fe7b7..cbfa110 100644 --- a/build.sbt +++ b/build.sbt @@ -4,10 +4,17 @@ name := "berkeley-entity" version := "1" -scalaVersion := "2.11.2" +scalaVersion := "2.11.6" assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } + +libraryDependencies ++= Seq( + "org.scalikejdbc" %% "scalikejdbc" % "2.2.5", + "com.h2database" % "h2" % "1.4.186", + "ch.qos.logback" % "logback-classic" % "1.1.2", + "org.postgresql" % "postgresql" % "9.4-1201-jdbc41" +) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index e248cbb..a2bc630 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -196,9 +196,9 @@ object JointQueryDenotationChooser { if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) - if(goldLabel.contains("Lord_Speaker")) { + /*if(goldLabel.contains("Lord_Speaker")) { println("wtfwtf") - } + }*/ } else { exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala index e7f6c56..71a1869 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala @@ -100,6 +100,7 @@ object Query { if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { queriesThisSlice += new Query(Seq(wikiCase(firstWord)) ++ mentWords.slice(indices._1 + 1, indices._2), ment, indices, "WIKICASED", RemovePuncFromQuery); } + // Stemming (but only on head alone) if (PluralQueryExpand && (indices._2 - indices._1) == 1 && firstWord.last == 's') { queriesThisSlice ++= queriesThisSlice.map(query => new Query(Seq(removePlural(query.words(0))), ment, indices, query.queryType + "-STEM", RemovePuncFromQuery)); @@ -124,18 +125,23 @@ object Query { var feats = featsi // gaaaaa val thisSlice = new ArrayBuffer[Query]() val wrds = mentWords.slice(start, end) - thisSlice += new Query(wrds, ment, (start, end), "STD", RemovePuncFromQuery, feats) + thisSlice += new Query(wrds, ment, (start, end), "STD", true, feats ++ List("RemovedPunc")) + thisSlice += new Query(wrds, ment, (start, end), "STD", false, feats ++ List("IncludePunc")) val firstWord = wrds(0) val lastWord = wrds(wrds.size - 1) if((end - start)== 1) feats ++= List("SingleItemQuery") if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { - thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", RemovePuncFromQuery, feats); + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", true, feats ++ List("RemovedPunc")); + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", false, feats ++ List("IncludePunc")); } // Stemming (but only on head alone) if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') { thisSlice ++= thisSlice.map(qu => - new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", RemovePuncFromQuery, feats)); + new Query(Seq(removePlural(qu.words(0))), ment, (start, 
end), qu.queryType + "-STEM", true, feats ++ List("RemovedPunc"))); + thisSlice ++= thisSlice.map(qu => + new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", false, feats ++ List("IncludePunc"))); + } queries ++= thisSlice } @@ -151,7 +157,8 @@ object Query { val filterWords = mentWords.filter(!isBlacklisted(_, 0)) if(filterWords.size != mentWords.size) { // we lost something, make new query - queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", RemovePuncFromQuery, List("FilteredQuery")) + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", true , List("FilteredQuery", "RemovedPunc")) + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", false, List("FilteredQuery", "IncludePunc")) } queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index a6ab5b6..bfe6325 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -74,7 +74,9 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, def disambiguate(ment: Mention) = disambiguateBest(ment, ment.headIdx) def disambiguateBest(ment: Mention, specifiedHeadIdx: Int) = { - redirectsDB.followRedirect(titleGivenSurfaceDB.disambiguateQueries(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))); + redirectsDB.followRedirect( + titleGivenSurfaceDB.disambiguateQueries( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr))); } def disambiguateBestNoDisambig(query: Query) = { @@ -107,10 +109,26 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, Seq(query.getFinalQueryStr)))); } + def merge[T](a: Counter[T], b: Counter[T]) = { + for(k <- 
a.keySet().asScala) { + b.incrementCount(k, a.getCount(k)) + } + } + def disambigRes(query: Query) = { - titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)) + val str = query.getFinalQueryStr + var titles = titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(str)) + titles.incrementCount(str, 1.0) + var redirs = redirectsDB.followRedirectsCounter(titles) + merge(titles, redirs) + //var aux = auxDB.purgeDisambiguationAll(redirs) + //merge(redirs, aux) + //aux + redirs } + + def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { auxDB.purgeDisambiguationAll( redirectsDB.followRedirectsCounter( @@ -264,7 +282,9 @@ object WikipediaInterface { // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. - val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet; + val queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) + .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)) + .toSet; Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB]; diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala new file mode 100644 index 0000000..6183546 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala @@ -0,0 +1,127 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.entity.GUtil +import 
edu.berkeley.nlp.futile.LightRunner +import edu.berkeley.nlp.futile.fig.basic.Indexer +import edu.berkeley.nlp.futile.util.CounterMap +import scalikejdbc._ + +import scala.collection.mutable + +/** + * Created by matthewfl + */ +class WikipediaInterface_db (conn : String) { + + + Class.forName("org.postgresql.Driver") + val settings = ConnectionPoolSettings( + initialSize = 1, + maxSize = 8, + connectionTimeoutMillis = 3000L, + validationQuery = "select 1") + ConnectionPool.add(this, conn, "wiki", "wiki", settings) + + /*val i1 : Option[Int] = using(DB(ConnectionPool.borrow(this))) { db => + db localTx { implicit session => + SQL("select 5 as i").map(r=>r.get[Int](1)).single.apply() + } + } + + println("value of il: "+i1.get) +*/ + + + def disambigRes(query: Query) = { + Seq[String]() + } + + def TitlesGivenSurface = { + var m = new CounterMap[String, String]() + using(DB(ConnectionPool.borrow(this))) { db => { + db localTx { implicit session => { + SQL("select surface_text, page_title, count(*) as cnt from links inner join page on page_latest = to_id group by surface_text, page_title") + .fetchSize(5000) + .foreach(res => { + m.incrementCount(res.string("surface_text"), res.string("page_title"), res.int("cnt")) + }) + }} + }} + new WikipediaTitleGivenSurfaceDB(m) + } + + def Redirects = { + val m = new mutable.HashMap[String,String]() + using(DB(ConnectionPool.borrow(this))) { db => + db localTx { implicit session => { + SQL( + """select pf.page_title as from_page, pt.page_title as to_page + from page pf inner join links on links.from_id = pf.page_latest + inner join page pt on links.to_id = pt.page_latest + where pf.page_is_redirect = 1 limit 10000""" + ).fetchSize(5000) + .foreach(res => { + println("loading redirect "+res.string("from_page")) + m += (res.string("from_page") -> res.string("to_page")) + }) + }} + } + new WikipediaRedirectsDB(m) + } + + def Links = { + // TODO: + val ind = new Indexer[String]() + + null.asInstanceOf[WikipediaLinkDB] + } + + def 
Aux = { + null.asInstanceOf[WikipediaAuxDB] + } + +} + + +object WikipediaInterface_db { + + // database connection string + val conn = "jdbc:postgresql://10.7.0.17/wiki" + + // most stuff should come out of the db + val wikipediaPath = "" + + val categoryDBInputPath = "" + val categoryDBOutputPath = "" + + val outputPath = "" + + def main(args : Array[String]): Unit = { + + LightRunner.initializeOutput(WikipediaInterface_db.getClass); + LightRunner.populateScala(WikipediaInterface_db.getClass, args); + + + var db = new WikipediaInterface_db(conn) + + val catDB = if(!categoryDBInputPath.isEmpty) { + GUtil.load(categoryDBInputPath).asInstanceOf[WikipediaCategoryDB] + } else { + // this is really slow to make the cat database, you should want to avoid this + assert(false) + null.asInstanceOf[WikipediaCategoryDB] + } + + val wi = new WikipediaInterface(db.TitlesGivenSurface, db.Redirects, catDB, db.Links, db.Aux) + + GUtil.save(wi, outputPath) + + if (categoryDBOutputPath != "") { + GUtil.save(catDB, categoryDBOutputPath); + } + LightRunner.finalizeOutput(); + + // going to punt on the links db, as it appears that it is not being used + + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala index d41605a..deaf7d4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala @@ -90,7 +90,9 @@ object WikipediaTitleGivenSurfaceDB { val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' '); def isGoodTitle(str: String) = !str.contains("#") && !str.contains(":") && !str.contains("Wikipedia") && !str.startsWith("List of") && !str.startsWith("List_of"); - + + // this is using the set of generated queries to determine which are the best items to extract + // / limit the size of 
the extracted wiki data def processWikipedia(wikipediaPath: String, querySet: Set[String]): WikipediaTitleGivenSurfaceDB = { val lowercase = false; val surfaceToTitle = new CounterMap[String,String]; @@ -130,13 +132,13 @@ object WikipediaTitleGivenSurfaceDB { Logger.logss(querySet.size + " queries, " + counter + " lines processed, " + surfaceToTitle.size + " surface strings found, " + surfaceToTitle.totalCount + " total count"); // .toSeq here to avoid a ConcurrentModificationException - for (key <- surfaceToTitle.keySet.asScala.toSeq) { + /*for (key <- surfaceToTitle.keySet.asScala.toSeq) { surfaceToTitle.getCounter(key).pruneKeysBelowThreshold(1.5); surfaceToTitle.getCounter(key).removeKey(""); if (surfaceToTitle.getCounter(key).isEmpty) { surfaceToTitle.removeKey(key); } - } + }*/ new WikipediaTitleGivenSurfaceDB(surfaceToTitle); } From 4ce0c436fb19d959c96a990d6f16a497489089f7 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Fri, 3 Apr 2015 09:33:56 -0700 Subject: [PATCH 19/25] attempt to include gold data when extracting useful components from wikipedia --- .../nlp/entity/wiki/WikipediaInterface.scala | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index bfe6325..f61df88 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -193,6 +193,8 @@ object WikipediaInterface { val categoryDBInputPath = ""; val categoryDBOutputPath = ""; + + val wikiStandoff = ""; def processWikipedia(wikipediaPath: String, queries: Set[String], parser: CoarseToFineMaxRuleParser, backoffParser: CoarseToFineMaxRuleParser): WikipediaInterface = { val titleGivenSurface = WikipediaTitleGivenSurfaceDB.processWikipedia(wikipediaPath, queries); @@ -282,9 +284,21 @@ object WikipediaInterface { // val queries = 
corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. - val queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) + var queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)) .toSet; + // some of the gold titles in the older dataset link to current redirect pages + // so we are loading them here so we can normalize the redirects when performing training/testing + val golds : Set[String] = if(!wikiStandoff.isEmpty) { + WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiStandoff).flatMap(d => { + d._2.flatMap(v => { + v._2.flatMap(_.label).map(_.replace("_"," ")) + }) + }).toSet + } else { + Set[String]() + } + queries = queries ++ golds Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB]; From 9f3752e632054d6edd7f0f601426fff5c5b6328d Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 9 Apr 2015 11:21:23 -0700 Subject: [PATCH 20/25] better printing, and fixes to links db --- .../wiki/JointQueryDenotationChooser.scala | 59 ++++++++++++---- .../nlp/entity/wiki/WikipediaInterface.scala | 4 +- .../nlp/entity/wiki/WikipediaLinkDB.scala | 70 ++++++++++++------- 3 files changed, 90 insertions(+), 43 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index a2bc630..6c4bbd4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -143,19 +143,40 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], computer.computeDenotation(ex, weights) }*/ - def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { + def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : (Seq[(String, Int)], Array[Array[Int]]) = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) - ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) + (ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse, + ex.cachedFeatsEachQuery) } - def diffFeatures(correct: Query, choosen: Query, wikiDB: WikipediaInterface) = { + def printEverything(queries: Seq[Query], wikiDB: WikipediaInterface, correctInd: Int) = { + // just redo the computations so gg + val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) + val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); + val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) + val sortedItms = ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse + + println( + s"""Correct item in $correctInd (${sortedItms(correctInd)._1}) + |\tGuessed value: ${sortedItms(0)._1}""".stripMargin) + for(i <- 0 until 
queries.length) { + println("\t\t"+i+": "+queries(i)) + println("\t\t"+ex.cachedFeatsEachQuery(i).map(featureIndexer.getObject(_)).mkString(" ")) + for(j <- 0 until ex.allDenotations.length) { + println("\t\t\t"+j+": "+ex.allDenotations(j)+": "+ex.cachedFeatsEachQueryDenotation(i)(j).map(featureIndexer.getObject(_)).mkString(" ")) + } + } + println() } + } object JointQueryDenotationChooser { @@ -195,7 +216,7 @@ object JointQueryDenotationChooser { // if (correctIndices.isEmpty && if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; - println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) + //println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) /*if(goldLabel.contains("Lord_Speaker")) { println("wtfwtf") }*/ @@ -286,8 +307,8 @@ object JointQueryDenotationChooser { val results = testExs.map(t => { // TODO: need more then one perdicted title - val picks = chooser.pickDenotations(t.queries, wikiDB) - if(!isCorrect(t.rawCorrectDenotations, picks(0))) { + val (picks, denFeats) = chooser.pickDenotations(t.queries, wikiDB) + if(!isCorrect(t.rawCorrectDenotations, picks(0)._1)) { // the pick is not correct, attempt to determine if there would have // been a better pick that is in the picks list (which basically means all of the /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) { @@ -296,21 +317,31 @@ object JointQueryDenotationChooser { println("second pick was correct") }*/ - var qq = false - for((p, i) <- picks.drop(1).zipWithIndex) { + var qq = -1 + for((p, i) <- picks.zipWithIndex) { // try: t.correctDenotations here? 
- if(isCorrect(t.correctDenotations, p) || isCorrect(t.rawCorrectDenotations, p)) { - println("Found correct item with "+i) + if(isCorrect(t.correctDenotations, p._1) || isCorrect(t.rawCorrectDenotations, p._1)) { + //println("Found correct item with "+i) correctItemWasInSet += 1 - qq = true + qq = i //println("found correct item") } } - if(!qq) { - println("???") + if(qq != -1) { + chooser.printEverything(t.queries, wikiDB, qq) + /*println( + s"""Correct item in place: $qq + |\tcorrect value: ${picks(qq)} + |\t\t${denFeats(picks(qq)._2).flatMap(featIndexer.getObject(_)).mkString(" ")} + |\tchosen value : ${picks(0)} + |\t\t${denFeats(picks(0)._2).flatMap(featIndexer.getObject(_)).mkString(" ")} + """.stripMargin) +*/ + } else { + println("THIS QUERY SHOULD HAVE BEEN FILTERED") } } - (t.rawCorrectDenotations, picks, t.queries(0).originalMent.rawDoc) + (t.rawCorrectDenotations, picks.map(_._1), t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index f61df88..223cabf 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -203,7 +203,7 @@ object WikipediaInterface { val links = if (WikipediaInterface.computeLinkDB) { WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc); } else { - new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]); + new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser); val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); @@ -219,7 +219,7 @@ object 
WikipediaInterface { val links = if (WikipediaInterface.computeLinkDB) { WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc); } else { - new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]); + new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index f2f1f6a..e8800c0 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -1,6 +1,7 @@ package edu.berkeley.nlp.entity.wiki import edu.berkeley.nlp.futile.fig.basic.Indexer +import scala.collection.mutable import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.futile.fig.basic.IOUtils @@ -14,33 +15,36 @@ import edu.berkeley.nlp.entity.lang.Language import edu.berkeley.nlp.entity.wiki._ @SerialVersionUID(9084163557546777842L) -class WikipediaLinkDB(val pageNameIndex: Indexer[String], - val inLinksMap: HashMap[String,Array[Int]], - val outLinksMap: HashMap[String,Array[Int]]) extends Serializable { - var outLinksSetCache: HashMap[String,Set[Int]] = null; +class WikipediaLinkDB(private val pageNameIndex: Indexer[String], + private val inLinksMap: HashMap[Int,Array[Int]], + private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable { + @transient + var outLinksSetCache = new mutable.HashMap[String,Set[Int]]() def getOutLinks(title: String) = { - if (outLinksMap.contains(title)) { - outLinksMap(title); + val k = pageNameIndex.indexOf(title) + if (outLinksMap.contains(k)) { + outLinksMap(k); } else { Array[Int](); } } - def 
getOutLinksSetUseCache(title: String) = { - if (outLinksMap.contains(title)) { - if (outLinksSetCache == null) { - outLinksSetCache = new HashMap[String,Set[Int]]; - } - if (!outLinksSetCache.contains(title)) { + def getOutLinksSetUseCache(title: String) : Set[Int] = { + if(outLinksSetCache.contains(title)) { + outLinksSetCache(title) + } else { + val k = pageNameIndex.indexOf(title) + if(k != -1) { if (outLinksSetCache.size > 1000) { outLinksSetCache.dropRight(1); } - outLinksSetCache.put(title, outLinksMap(title).toSet); + val s = outLinksMap(k).toSet + outLinksSetCache.put(title, s) + s + } else { + Set[Int]() } - outLinksSetCache(title); - } else { - Set[Int](); } } @@ -56,9 +60,11 @@ class WikipediaLinkDB(val pageNameIndex: Indexer[String], } def doesOneLinkToOther(title1: String, title2: String): Boolean = { + val ti1 = pageNameIndex.indexOf(title1) + val ti2 = pageNameIndex.indexOf(title2) val outLinksTitle1 = getOutLinks(title1); val outLinksTitle2 = getOutLinks(title2); - outLinksTitle1.contains(pageNameIndex.indexOf(title2)) || outLinksTitle2.contains(pageNameIndex.indexOf(title1)) + outLinksTitle1.contains(ti2) || outLinksTitle2.contains(ti1) } } @@ -66,18 +72,19 @@ object WikipediaLinkDB { def processWikipedia(wikipediaPath: String, pageTitleSetLc: Set[String]): WikipediaLinkDB = { val pageNamesIndex = new Indexer[String]; - val inLinksMap = new HashMap[String,HashSet[Int]]; - val outLinksMap = new HashMap[String,HashSet[Int]]; + val inLinksMap = new HashMap[Int,HashSet[Int]]; + val outLinksMap = new HashMap[Int,HashSet[Int]]; val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath)); var currentPageTitle = ""; - var linksThisPage = new StringBuilder(); + var currentPageTitleind = 0 + //var linksThisPage = new StringBuilder(); var doneWithThisPage = false; var numPagesSeen = 0; var lineIdx = 0; - var isInText = false; - val categoryMap = new HashMap[String,ArrayBuffer[String]]; - val infoboxMap = new HashMap[String,String]; - val appositiveMap 
= new HashMap[String,String]; + //var isInText = false; + //val categoryMap = new HashMap[String,ArrayBuffer[String]]; + //val infoboxMap = new HashMap[String,String]; + //val appositiveMap = new HashMap[String,String]; // Extract first line that's not in brackets while (lines.hasNext) { val line = lines.next; @@ -96,6 +103,7 @@ object WikipediaLinkDB { } else if (line.contains("")) { // 7 = "<title>".length() currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); + currentPageTitleind = pageNamesIndex.getIndex(currentPageTitle) if (!pageTitleSetLc.contains(currentPageTitle.toLowerCase)) { doneWithThisPage = true; } @@ -115,15 +123,23 @@ object WikipediaLinkDB { } if (linkDest != "") { val idx = pageNamesIndex.getIndex(linkDest); - if (!outLinksMap.contains(currentPageTitle)) { - outLinksMap.put(currentPageTitle, new HashSet[Int]); + if (!outLinksMap.contains(currentPageTitleind)) { + outLinksMap.put(currentPageTitleind, new HashSet[Int]); } - outLinksMap(currentPageTitle) += idx; + outLinksMap(currentPageTitleind) += idx; } startIdx = line.indexOf("[[", startIdx + 2); } } } + outLinksMap.foreach(a => { + a._2.foreach(b => { + if(!inLinksMap.contains(b)) { + inLinksMap.put(b, new mutable.HashSet[Int]) + } + inLinksMap(b) += a._1 + }) + }) val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); // TODO: WTF: inlinksmap is never written to val outLinksMapArrs = outLinksMap.map(entry => entry._1 -> entry._2.toArray); val sizes = Array.tabulate(10)(i => 0); From 83ad9542815900e3455d6cad57019a2669bb5d53 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sun, 12 Apr 2015 22:32:27 -0700 Subject: [PATCH 21/25] failed attempt to simply add global wikification features, going to need a classifier that is aware about other choices in the document --- .../wiki/JointQueryDenotationChooser.scala | 2 +- .../nlp/entity/wiki/QueryChooser.scala | 150 ++++++++++++++++++ .../nlp/entity/wiki/WikipediaLinkDB.scala | 52 +++++- 
.../entity/wiki/WikipediaRedirectsDB.scala | 1 + 4 files changed, 199 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 6c4bbd4..9b206d5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -44,7 +44,7 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) - ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations(ex.queries, ex.allDenotations, addToIndexer) + ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB) } } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index 90538cb..de02806 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -18,6 +18,8 @@ import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.coref.CorefDocAssembler import edu.berkeley.nlp.entity.coref.MentionPropertyComputer +import scala.collection.mutable + case class QueryChoiceExample(val queries: Seq[Query], val denotations: Seq[String], val correctQueryIndices: Array[Int]) { @@ -128,6 +130,154 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, feats.toArray; }); } + + def getDentationLinksSets(denotations: Seq[String], wikiDB: WikipediaInterface) : (Seq[Set[Int]], Seq[Set[Int]]) = { + (denotations.map(wikiDB.linksDB.getInLinksSetUseCache(_)), 
denotations.map(wikiDB.linksDB.getOutLinksSetUseCache(_))) + } + + val logsv = (0 until 3000).map(Math.log(_)) + + def logs(i: Int) = { + if(i < logsv.size) + logsv(i) + else + Math.log(i) + } + + def unionSize[T](ss: Set[T]*) = { + val ns = new mutable.HashSet[T]() + for(s <- ss) { + ns ++= s + } + ns.size + } + + def intersectSize[T](a: Set[T], b: Set[T]) = { + var smaller: Set[T] = a + var larger: Set[T] = b + if(a.size > b.size) { + larger = a + smaller = b + } + var ret = 0 + for(i <- smaller) { + if(larger.contains(i)) + ret += 1 + } + ret + } + + def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) / + (logs(wsize) - logs(math.min(a.size,b.size))) + } + + def PMI[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + // TODO: ? the use of wsize here does not make since + // must be misunderstanding something + (intersectSize(a,b) * wsize).asInstanceOf[Float] / (a.size * b.size) + } + + def GLOWfeatures[T](fn: (Set[T], Set[T], Int) => Double, refs: Seq[Set[T]], prefix: String): Seq[Array[String]] = { + val rsize = refs.size + val wsize = unionSize(refs:_*) + var max = Double.NegativeInfinity + var avg = 0.0 + // TODO: rank the items in the list + //val valList = new mutable.MutableList[Double]() + val cache = new mutable.HashMap[Int,Double] { + override def initialSize: Int = rsize*rsize + } + for(a <- 0 until rsize; b <- 0 until rsize) { + if(a != b) { + val v = fn(refs(a), refs(b), wsize) + cache.put(a + b*65536, v) + if(v > max) + max = v + //valList += v + avg += v + } + } + avg /= (rsize * (rsize - 1)) + for(a <- 0 until rsize) yield { + var isInMax = false + var isAboveAvg = false + var isAboveAvg2 = false + for(b <- 0 until rsize) { + if(a != b) { + //val v = fn(refs(a),refs(b),wsize) + val v : Double = cache.getOrElse(a + b*65536, 0.0) + if(v == max) { + isInMax = true + } + if(v > avg) { + isAboveAvg = true + } + if(v > (avg * 2)) { + isAboveAvg2 = true + } + } + } + val r = new 
ArrayBuffer[String] + if(isInMax) + r += prefix + "IsInMax" + if(isAboveAvg) + r += prefix + "isAboveAvg" + if(isAboveAvg2) + r += prefix + "isAboveAvg2" + r.toArray + } + } + + def featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = { + val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val queryNonemptyList = queryOutcomes.map(_.isEmpty); + val ment = queries.head.originalMent; + val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; + val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) + + val PMINGDvals = Seq( + GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), + GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), + GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), + GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-") + ) + // TODO: this is not correct,..... + + + Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => { + val feats = new ArrayBuffer[Int]; + def feat(str: String) = addFeat(str, feats, addToIndexer); + for(p <- PMINGDvals) + for(f <- p(denIdx)) + feat(f) + val query = queries(queryIdx); + val den = denotations(denIdx); + if (den == NilToken) { + feat("NilAndQueryNonempty=" + queryNonemptyList(queryIdx)); + } else if (queryOutcomes(queryIdx).containsKey(den)) { + val queryDescriptorWithProper = (if (ment.pos(ment.headIdx - ment.startIdx) == "NNP") "PROP" else "NOM") + "-" + query.queryType; + val queryRank = queryOutcomes(queryIdx).getSortedKeys().indexOf(den); + feat("Rank=" + queryDescriptorWithProper + "-" + (queryRank + 1)) + val queryStr = query.getFinalQueryStr; + val matchesQuery = den.toLowerCase == queryStr.toLowerCase; + feat("MatchesQuery=" + queryDescriptorWithProper + "-" + matchesQuery) + if (!matchesQuery) { + feat("ContainsQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.contains(queryStr.toLowerCase))); + feat("StartsWithQuery=" + 
queryDescriptorWithProper + "-" + (den.toLowerCase.startsWith(queryStr.toLowerCase))); + feat("EndsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.endsWith(queryStr.toLowerCase))); + } + val denotationHasParenthetical = den.contains("(") && den.endsWith(")"); + feat("ContainsParenthetical=" + queryDescriptorWithProper + "-" + denotationHasParenthetical); + if (denotationHasParenthetical) { + feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase)) + } + } else { + feat("Impossible"); + } + feats.toArray; + }); + } def featurizeQueriesAndDenotations(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean): Array[Array[Array[Int]]] = { val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index e8800c0..d2b00cb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -19,8 +19,11 @@ class WikipediaLinkDB(private val pageNameIndex: Indexer[String], private val inLinksMap: HashMap[Int,Array[Int]], private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable { @transient - var outLinksSetCache = new mutable.HashMap[String,Set[Int]]() - + private var outLinksSetCache : mutable.HashMap[String,Set[Int]] = null + + @transient + private var inLinksSetCache : mutable.HashMap[String,Set[Int]] = null + def getOutLinks(title: String) = { val k = pageNameIndex.indexOf(title) if (outLinksMap.contains(k)) { @@ -29,17 +32,51 @@ class WikipediaLinkDB(private val pageNameIndex: Indexer[String], Array[Int](); } } - + + def getInLinks(title: String) = { + val k = pageNameIndex.indexOf(title) + if(inLinksMap.contains(k)) { + inLinksMap(k) + } else { + Array[Int]() + } + } + + def 
getInLinksSetUseCache(title: String) : Set[Int] = { + if(inLinksSetCache == null) { + inLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } + if(inLinksSetCache.contains(title)) { + inLinksSetCache(title) + } else { + val k = pageNameIndex.indexOf(title) + if(k != -1) { + if (inLinksSetCache.size > 1000) { + inLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } + val s = inLinksMap.getOrElse(k, Array[Int]()).toSet + inLinksSetCache.put(title, s) + s + } else { + Set[Int]() + } + } + } + def getOutLinksSetUseCache(title: String) : Set[Int] = { + if(outLinksSetCache == null) { + outLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } if(outLinksSetCache.contains(title)) { outLinksSetCache(title) } else { val k = pageNameIndex.indexOf(title) if(k != -1) { if (outLinksSetCache.size > 1000) { - outLinksSetCache.dropRight(1); + // dropping one item was taking too long + outLinksSetCache = new mutable.HashMap[String,Set[Int]]() } - val s = outLinksMap(k).toSet + val s = outLinksMap.getOrElse(k, Array[Int]()).toSet outLinksSetCache.put(title, s) s } else { @@ -108,6 +145,11 @@ object WikipediaLinkDB { doneWithThisPage = true; } } else if (line.contains("")) + val idx = pageNamesIndex.getIndex(linkDest) + val hs = new HashSet[Int] + hs.add(idx) + outLinksMap.put(currentPageTitleind, hs) doneWithThisPage = true; } var startIdx = line.indexOf("[["); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala index 654f8fa..dbc4e88 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala @@ -48,6 +48,7 @@ object WikipediaRedirectsDB { val CapitalizeInitial = true; def removeWeirdMarkup(str: String) = { + // TODO: this is a slow method, don't use str.replace("'", "'"); } From 5b5a5d7ba50ec6037e7350256049c00d53f81106 Mon Sep 17 00:00:00 2001 From: Matthew 
Francis-Landau Date: Sun, 12 Apr 2015 22:35:06 -0700 Subject: [PATCH 22/25] remove unused sql attempt --- build.sbt | 7 - .../entity/wiki/WikipediaInterface_db.scala | 127 ------------------ 2 files changed, 134 deletions(-) delete mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala diff --git a/build.sbt b/build.sbt index cbfa110..77738fd 100644 --- a/build.sbt +++ b/build.sbt @@ -11,10 +11,3 @@ assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } - -libraryDependencies ++= Seq( - "org.scalikejdbc" %% "scalikejdbc" % "2.2.5", - "com.h2database" % "h2" % "1.4.186", - "ch.qos.logback" % "logback-classic" % "1.1.2", - "org.postgresql" % "postgresql" % "9.4-1201-jdbc41" -) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala deleted file mode 100644 index 6183546..0000000 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala +++ /dev/null @@ -1,127 +0,0 @@ -package edu.berkeley.nlp.entity.wiki - -import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.futile.LightRunner -import edu.berkeley.nlp.futile.fig.basic.Indexer -import edu.berkeley.nlp.futile.util.CounterMap -import scalikejdbc._ - -import scala.collection.mutable - -/** - * Created by matthewfl - */ -class WikipediaInterface_db (conn : String) { - - - Class.forName("org.postgresql.Driver") - val settings = ConnectionPoolSettings( - initialSize = 1, - maxSize = 8, - connectionTimeoutMillis = 3000L, - validationQuery = "select 1") - ConnectionPool.add(this, conn, "wiki", "wiki", settings) - - /*val i1 : Option[Int] = using(DB(ConnectionPool.borrow(this))) { db => - db localTx { implicit session => - SQL("select 5 as i").map(r=>r.get[Int](1)).single.apply() - } - } - - println("value of il: "+i1.get) -*/ - - - def disambigRes(query: Query) = { 
- Seq[String]() - } - - def TitlesGivenSurface = { - var m = new CounterMap[String, String]() - using(DB(ConnectionPool.borrow(this))) { db => { - db localTx { implicit session => { - SQL("select surface_text, page_title, count(*) as cnt from links inner join page on page_latest = to_id group by surface_text, page_title") - .fetchSize(5000) - .foreach(res => { - m.incrementCount(res.string("surface_text"), res.string("page_title"), res.int("cnt")) - }) - }} - }} - new WikipediaTitleGivenSurfaceDB(m) - } - - def Redirects = { - val m = new mutable.HashMap[String,String]() - using(DB(ConnectionPool.borrow(this))) { db => - db localTx { implicit session => { - SQL( - """select pf.page_title as from_page, pt.page_title as to_page - from page pf inner join links on links.from_id = pf.page_latest - inner join page pt on links.to_id = pt.page_latest - where pf.page_is_redirect = 1 limit 10000""" - ).fetchSize(5000) - .foreach(res => { - println("loading redirect "+res.string("from_page")) - m += (res.string("from_page") -> res.string("to_page")) - }) - }} - } - new WikipediaRedirectsDB(m) - } - - def Links = { - // TODO: - val ind = new Indexer[String]() - - null.asInstanceOf[WikipediaLinkDB] - } - - def Aux = { - null.asInstanceOf[WikipediaAuxDB] - } - -} - - -object WikipediaInterface_db { - - // database connection string - val conn = "jdbc:postgresql://10.7.0.17/wiki" - - // most stuff should come out of the db - val wikipediaPath = "" - - val categoryDBInputPath = "" - val categoryDBOutputPath = "" - - val outputPath = "" - - def main(args : Array[String]): Unit = { - - LightRunner.initializeOutput(WikipediaInterface_db.getClass); - LightRunner.populateScala(WikipediaInterface_db.getClass, args); - - - var db = new WikipediaInterface_db(conn) - - val catDB = if(!categoryDBInputPath.isEmpty) { - GUtil.load(categoryDBInputPath).asInstanceOf[WikipediaCategoryDB] - } else { - // this is really slow to make the cat database, you should want to avoid this - assert(false) - 
null.asInstanceOf[WikipediaCategoryDB] - } - - val wi = new WikipediaInterface(db.TitlesGivenSurface, db.Redirects, catDB, db.Links, db.Aux) - - GUtil.save(wi, outputPath) - - if (categoryDBOutputPath != "") { - GUtil.save(catDB, categoryDBOutputPath); - } - LightRunner.finalizeOutput(); - - // going to punt on the links db, as it appears that it is not being used - - } -} \ No newline at end of file From b577da33a8ff64fe2b92f2f641ad4b681b092ddc Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 14 Apr 2015 00:00:47 -0700 Subject: [PATCH 23/25] text db for getting some bow features from documents --- .../entity/wiki/DocumentedSetChooser.scala | 42 +++++++++ .../wiki/JointQueryDenotationChooser.scala | 4 +- .../nlp/entity/wiki/QueryChooser.scala | 25 ++++- .../nlp/entity/wiki/WikipediaInterface.scala | 9 +- .../nlp/entity/wiki/WikipediaTextDB.scala | 91 +++++++++++++++++++ 5 files changed, 162 insertions(+), 9 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala new file mode 100644 index 0000000..acd575e --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala @@ -0,0 +1,42 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.futile.LightRunner + +/** + * Created by matthewfl + * + * We want to work with the who document at a time rather then just a single link + * this will allow us to + */ +class DocumentedSetChooser { + +} + + +object DocumentedSetChooser { + + val trainDataPath = "data/ace05/train"; + val testDataPath = "data/ace05/dev"; + val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both items + val wikiDBPath = "models/wiki-db-ace.ser.gz" + + val lambda = 1e-8F + val batchSize = 1 
+ val numItrs = 20 + + val maxNumWikificationOptions = 20 //7 + + val numLoadedSamples = -1 // for debugging by loading less samples + + + def main(args: Array[String]) = { + LightRunner.initializeOutput(DocumentedSetChooser.getClass) + LightRunner.populateScala(DocumentedSetChooser.getClass, args) + + // load the documents + + + + LightRunner.finalizeOutput() + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 9b206d5..d887378 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -284,7 +284,9 @@ object JointQueryDenotationChooser { // Make training examples, filtering out those with solutions that are unreachable because // they're not good for training val trainExs = extractExamples(trainCorefDocs, goldWikification, wikiDB, filterImpossible = true) - + + // going to have make this system work on a set of a document + // Extract features val featIndexer = new Indexer[String] val computer = new JointQueryDenotationChoiceComputer(wikiDB, featIndexer); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index de02806..f800213 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -167,7 +167,7 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, ret } - def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + /*def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) / (logs(wsize) - logs(math.min(a.size,b.size))) } @@ -227,7 +227,7 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, r += prefix + "isAboveAvg2" r.toArray } - } + }*/ def 
featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = { val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); @@ -236,21 +236,36 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) - val PMINGDvals = Seq( + /*val PMINGDvals = Seq( GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-") - ) + )*/ + // TODO: this is not correct... + // we need to know what we are going to annotate the document with; + // these are going to be denotations for a single example, which won't be useful, + // so we need to get all the possible annotations for a given document. + // + // In the wikification paper they have something that is choosing the references together; + // we need to look at pairs of references. + + + + + // TODO: implement the local vector features which compare the text of the pages + // the context can be the set of items linking into/out of a page?
but then that isn't the similarity + Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => { val feats = new ArrayBuffer[Int]; def feat(str: String) = addFeat(str, feats, addToIndexer); - for(p <- PMINGDvals) + /*for(p <- PMINGDvals) for(f <- p(denIdx)) feat(f) + */ val query = queries(queryIdx); val den = denotations(denIdx); if (den == NilToken) { diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 223cabf..d87fbfe 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -56,7 +56,8 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, val redirectsDB: WikipediaRedirectsDB, val categoryDB: WikipediaCategoryDB, val linksDB: WikipediaLinkDB, - val auxDB: WikipediaAuxDB) extends Serializable { + val auxDB: WikipediaAuxDB, + val textDB: WikipediaTextDB) extends Serializable { def getStandardPriorForJointModel(ment: Mention) = { val counter = new Counter[String]; @@ -207,7 +208,8 @@ object WikipediaInterface { } val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser); val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); - val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux); + val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc); + val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux, texts); wi.printSome(); wi; } @@ -222,7 +224,8 @@ object WikipediaInterface { new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); - val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux); + val texts = 
WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc); + val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux, texts); wi.printSome(); wi; } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala new file mode 100644 index 0000000..66cb11f --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -0,0 +1,91 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.futile.fig.basic.{IOUtils, Indexer} +import edu.berkeley.nlp.futile.util.Counter + +import scala.collection.JavaConversions._ + + +import scala.StringBuilder +import scala.collection.mutable + +/** + * Created by matthewfl + * + * Provide proxy bow counts for documents so we can compute the similarity between two documents + */ +@SerialVersionUID(1L) +class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Counter[Int]]) { + + +} + +object WikipediaTextDB { + def processWikipedia(wikipediaPath:String, querySet: Set[String]) : WikipediaTextDB = { + val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath)); + var currentPageTitle: String = null + val indexer = new Indexer[String] + val totalWordCounts = new Counter[Int] + var currentWordCounts = new Counter[Int] + val documentResults = new mutable.HashMap[String,Counter[Int]] + var lineIdx = 0 + var numPagesSeen = 0 + var doneWithThisPage = false + + while(lines.hasNext) { + val line = lines.next + if (lineIdx % 100000 == 0) { + println("Line: " + lineIdx + ", processed " + numPagesSeen + " pages"); + } + lineIdx += 1; + if (line.size > 8 && doneWithThisPage) { + // Do nothing + } else { + if(line.contains("")) { + doneWithThisPage = false + numPagesSeen += 1 + } else if (line.contains("")) { + // 7 = "<title>".length() + currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); + if 
(!querySet.contains(currentPageTitle.toLowerCase)) { + doneWithThisPage = true; + } else { + currentWordCounts = new Counter[Int]() + documentResults += (currentPageTitle -> currentWordCounts) + } + } else if(line.contains("") + 1 + var document = new StringBuilder() + var textEnd = line.indexOf("") + if(textEnd != -1) { + document.append(line.substring(textStart, textEnd)) + } else { + var curLine = line.substring(textStart) + while(textEnd == -1) { + document.append(curLine) + curLine = lines.next + textEnd = curLine.indexOf("") + } + document.append(curLine.substring(0, textEnd)) + } + // TODO: maybe toSet + document.toString.split("[^A-Za-z]").foreach(w => { + val i = indexer.getIndex(w) + totalWordCounts.incrementCount(i, 1.0) + currentWordCounts.incrementCount(i, 1.0) + }) + } + } + } + + // get the 300 most common words and remove them from all the documents + val wrdsq = totalWordCounts.asPriorityQueue + val removeWords = new mutable.HashSet[Int]() + for(i <- 0 until 300; if wrdsq.hasNext) + removeWords += wrdsq.next + documentResults.foreach(_._2.prune(removeWords)) + + + new WikipediaTextDB(indexer, documentResults) + } +} From 92b5173c537d044a9657ccd188c485333c8b5692 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 14 Apr 2015 11:53:09 -0700 Subject: [PATCH 24/25] some bug fixes and reduce memory pressure --- .../nlp/entity/wiki/WikipediaTextDB.scala | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala index 66cb11f..5aef776 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -12,10 +12,10 @@ import scala.collection.mutable /** * Created by matthewfl * - * Provide proxy bow counts for documents so we can compute the similarity between two documents + * Provide bow counts for 
documents so we can compute the similarity between two documents */ @SerialVersionUID(1L) -class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Counter[Int]]) { +class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Array[Int]]) extends Serializable { } @@ -26,8 +26,8 @@ object WikipediaTextDB { var currentPageTitle: String = null val indexer = new Indexer[String] val totalWordCounts = new Counter[Int] - var currentWordCounts = new Counter[Int] - val documentResults = new mutable.HashMap[String,Counter[Int]] + var currentWordCounts = new mutable.HashSet[Int] + val documentResults = new mutable.HashMap[String,Array[Int]] var lineIdx = 0 var numPagesSeen = 0 var doneWithThisPage = false @@ -46,16 +46,19 @@ object WikipediaTextDB { numPagesSeen += 1 } else if (line.contains("")) { // 7 = "<title>".length() - currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); - if (!querySet.contains(currentPageTitle.toLowerCase)) { + val newPageTitle = line.substring(line.indexOf("") + 7, line.indexOf("")); + if (!querySet.contains(newPageTitle.toLowerCase)) { doneWithThisPage = true; } else { - currentWordCounts = new Counter[Int]() - documentResults += (currentPageTitle -> currentWordCounts) + if(currentPageTitle != null) { + documentResults += (currentPageTitle -> currentWordCounts.toArray) + } + currentWordCounts = new mutable.HashSet[Int]() + currentPageTitle = newPageTitle } } else if(line.contains("") + 1 - var document = new StringBuilder() + val document = new StringBuilder() var textEnd = line.indexOf("") if(textEnd != -1) { document.append(line.substring(textStart, textEnd)) @@ -72,7 +75,8 @@ object WikipediaTextDB { document.toString.split("[^A-Za-z]").foreach(w => { val i = indexer.getIndex(w) totalWordCounts.incrementCount(i, 1.0) - currentWordCounts.incrementCount(i, 1.0) + currentWordCounts += i + //currentWordCounts.incrementCount(i, 1.0) }) } } @@ -83,7 +87,9 @@ object 
WikipediaTextDB { val removeWords = new mutable.HashSet[Int]() for(i <- 0 until 300; if wrdsq.hasNext) removeWords += wrdsq.next - documentResults.foreach(_._2.prune(removeWords)) + for(k <- documentResults) { + documentResults(k._1) = k._2.filter(!removeWords.contains(_)).sorted + } new WikipediaTextDB(indexer, documentResults) From 1a5e9454d21e285084c5e338b7153cb3e8b86e9d Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 18 Apr 2015 17:20:14 -0700 Subject: [PATCH 25/25] fixes for document word vectors --- .../edu/berkeley/nlp/entity/Document.scala | 2 ++ .../wiki/JointQueryDenotationChooser.scala | 4 +++ .../nlp/entity/wiki/QueryChooser.scala | 13 ++++--- .../nlp/entity/wiki/WikipediaTextDB.scala | 34 ++++++++++++++++++- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala index 44555df..cf95766 100644 --- a/src/main/java/edu/berkeley/nlp/entity/Document.scala +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -31,4 +31,6 @@ trait Document { def isConversation : Boolean = false def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]] + + var documentVectorCache: Array[Int] = null } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index d887378..b145732 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -29,6 +29,8 @@ case class JointQueryDenotationExample(val queries: Seq[Query], // Feature caches since feature computation is expensive if redone every time var cachedFeatsEachQuery: Array[Array[Int]] = null; var cachedFeatsEachQueryDenotation: Array[Array[Array[Int]]] = null; + + def document = queries.head.originalMent.rawDoc } /** @@ -43,6 +45,8 @@ class 
JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { + if(ex.document.documentVectorCache == null) + ex.document.documentVectorCache = wikiDB.textDB.makeVector(ex.document.words) ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index f800213..712ad3d 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -234,9 +234,9 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val queryNonemptyList = queryOutcomes.map(_.isEmpty); val ment = queries.head.originalMent; val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; - val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) + /*val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) - /*val PMINGDvals = Seq( + val PMINGDvals = Seq( GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), @@ -251,8 +251,9 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, // in the wikification paper they have something that is choosing the references together // need to look at pairs of references and - - + val denotationSim = denotations.map(t => wikiDB.textDB.compareDocumentC(ment.rawDoc.documentVectorCache, t)) + val denotationSimMax = denotationSim.max + val denotationSimAvg = denotationSim.sum / denotationSim.size // TODO: implement the local vector features which compare the text of the pages // the context can be the set of items linking into/outof a page? 
but then that isn't the similarity @@ -287,6 +288,10 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, if (denotationHasParenthetical) { feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase)) } + feat("CompariableWordsLog="+Math.floor(Math.log(denotationSim(denIdx)))) + feat("CompariableIsMaxWordSim=" + (denotationSim(denIdx) == denotationSimMax)) + feat("CompariableWordsAboveAvg=" + (denotationSim(denIdx) > denotationSimAvg)) + feat("CompariableWordsReweight="+Math.floor(denotationSim(denIdx) / denotationSimMax * 10)) } else { feat("Impossible"); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala index 5aef776..aca157a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -17,6 +17,38 @@ import scala.collection.mutable @SerialVersionUID(1L) class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Array[Int]]) extends Serializable { + def getDocument(title: String) = words.getOrElse(title, Array[Int]()) + + def compareVectors(a: Array[Int], b: Array[Int]) = { + var ai = 0 + var bi = 0 + var simcnt = 0 + while(ai < a.size && bi < b.size) { + if(a(ai) == b(bi)) { + simcnt += 1 + ai += 1 + bi += 1 + } else if(a(ai) > b(bi)) { + bi += 1 + } else { + ai += 1 + } + } + simcnt + } + + def compareTitles(atitle: String, btitle: String) = compareVectors(getDocument(atitle), getDocument(btitle)) + + def makeVector(document: Seq[Seq[String]]) = { + document.flatMap(_.map(v => indexer.indexOf(v.toLowerCase))).toSet.filter(_ != -1).toArray.sorted + } + + def compareDocument(doc: Array[Int], title: String) = compareVectors(doc, getDocument(title)) + + def compareDocumentC(doc: Array[Int], title: String) = { + val tdoc = getDocument(title) + compareVectors(doc, 
tdoc).asInstanceOf[Double] / (doc.size * tdoc.size) + } } @@ -73,7 +105,7 @@ object WikipediaTextDB { } // TODO: maybe toSet document.toString.split("[^A-Za-z]").foreach(w => { - val i = indexer.getIndex(w) + val i = indexer.getIndex(w.toLowerCase) totalWordCounts.incrementCount(i, 1.0) currentWordCounts += i //currentWordCounts.incrementCount(i, 1.0)