From ad98fef199b8939c04a0ce197ced5adbeb9e01b5 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 21 Feb 2015 01:21:11 -0800 Subject: [PATCH 01/25] start to wiki interface, looks like sentence splitting is working --- .gitignore | 8 ++ Makefile | 14 ++++ .../edu/berkeley/nlp/entity/ConllDoc.scala | 16 ++-- .../berkeley/nlp/entity/ConllDocReader.scala | 6 +- .../berkeley/nlp/entity/ConllDocWriter.scala | 10 +-- .../edu/berkeley/nlp/entity/Document.scala | 34 ++++++++ .../berkeley/nlp/entity/EntitySystem.scala | 2 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 34 ++++++++ .../berkeley/nlp/entity/WikiDocReader.scala | 77 +++++++++++++++++++ .../nlp/entity/coref/CorefConllScorer.scala | 8 +- .../berkeley/nlp/entity/coref/CorefDoc.scala | 4 +- .../nlp/entity/coref/CorefDocAssembler.scala | 34 ++++---- .../entity/coref/CorefDocAssemblerACE.scala | 4 +- .../nlp/entity/coref/CorefSystem.scala | 2 +- .../berkeley/nlp/entity/coref/Mention.scala | 6 +- .../berkeley/nlp/entity/joint/JointDoc.scala | 8 +- .../nlp/entity/joint/JointDocACE.scala | 6 +- .../nlp/entity/joint/JointPredictor.scala | 2 +- .../nlp/entity/joint/JointPredictorACE.scala | 2 +- .../berkeley/nlp/entity/ner/NEEvaluator.scala | 6 +- .../nlp/entity/ner/NESentenceMunger.scala | 4 +- .../berkeley/nlp/entity/ner/NerPruner.scala | 10 +-- .../nlp/entity/ner/NerSystemLabeled.scala | 15 ++-- .../nlp/entity/preprocess/Reprocessor.scala | 2 +- .../entity/preprocess/SentenceSplitter.scala | 8 +- .../nlp/entity/wiki/WikipediaInterface.scala | 7 +- 26 files changed, 248 insertions(+), 81 deletions(-) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 src/main/java/edu/berkeley/nlp/entity/Document.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d7ce67f --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ 
+berkeley-entity-models.tgz +data.tgz +data/ +expers/ +models/ +project/project/ +project/target/ +target/ \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..eab9e29 --- /dev/null +++ b/Makefile @@ -0,0 +1,14 @@ +# some random useful functions + +TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar + +all: $(TARGET) + +$(TARGET): $(wildcard src/**) + sbt assembly + +aceTester: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.ACETester -dataPath data/ace05/ace05-all-conll + +queryModel: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala index d29aaa0..b4012e9 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDoc.scala @@ -17,13 +17,13 @@ case class ConllDoc(val docID: String, val trees: Seq[DepConstTree], val nerChunks: Seq[Seq[Chunk[String]]], val corefChunks: Seq[Seq[Chunk[Int]]], - val speakers: Seq[Seq[String]]) { + val speakers: Seq[Seq[String]]) extends Document { - val numSents = words.size; + override val numSents = words.size; - def uid = docID -> docPartNo; + override def uid = docID -> docPartNo; - def fileName = { + override def fileName = { if (docID.contains("/")) { docID.substring(docID.lastIndexOf("/") + 1); } else { @@ -31,11 +31,11 @@ case class ConllDoc(val docID: String, } } - def printableDocName = docID + " (part " + docPartNo + ")"; + override def printableDocName = docID + " (part " + docPartNo + ")"; - def isConversation = docID.startsWith("bc") || docID.startsWith("wb"); - - def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx); + override def isConversation = docID.startsWith("bc") || docID.startsWith("wb") + + override def 
getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = ConllDoc.getCorrespondingNERChunk(nerChunks(sentIdx), headIdx); } object ConllDoc { diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala index 91685f3..9847abd 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala @@ -38,7 +38,7 @@ class ConllDocReader(val lang: Language, case _ => throw new RuntimeException("Bad language, no head finder for " + lang); } - def readConllDocs(fileName: String): Seq[ConllDoc] = { + def readConllDocs(fileName: String): Seq[Document] = { val fcn = (docID: String, docPartNo: Int, docBySentencesByLines: ArrayBuffer[ArrayBuffer[String]]) => assembleConllDoc(docBySentencesByLines, docID, docPartNo); ConllDocReader.readConllDocsGeneral(fileName, fcn); } @@ -283,7 +283,7 @@ object ConllDocReader { // loadRawConllDocsWithSuffix(path, size, if (gold) "gold_conll" else "auto_conll", lang, betterParsesFile); // } - def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[ConllDoc] = { + def loadRawConllDocsWithSuffix(path: String, size: Int, suffix: String, lang: Language = Language.ENGLISH, betterParsesFile: String = ""): Seq[Document] = { Logger.logss("Loading " + size + " docs from " + path + " ending with " + suffix); val rawDir = new File(path); if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { @@ -292,7 +292,7 @@ object ConllDocReader { val rawFiles = rawDir.listFiles.sortBy(_.getAbsolutePath()); val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix)); val reader = new ConllDocReader(lang, betterParsesFile); - val docs = new ArrayBuffer[ConllDoc]; + val docs = new ArrayBuffer[Document]; var docCounter = 0; var fileIdx = 0; while (fileIdx < files.size 
&& (size == -1 || docCounter < size)) { diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala index 395a268..422a694 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocWriter.scala @@ -16,7 +16,7 @@ import edu.berkeley.nlp.entity.wiki.WikiAnnotReaderWriter object ConllDocWriter { - def writeDoc(writer: PrintWriter, conllDoc: ConllDoc, clustering: OrderedClusteringBound) { + def writeDoc(writer: PrintWriter, conllDoc: Document, clustering: OrderedClusteringBound) { writeIncompleteConllDoc(writer, conllDoc.docID, conllDoc.docPartNo, conllDoc.words, conllDoc.pos, conllDoc.trees.map(_.constTree), conllDoc.speakers, conllDoc.nerChunks, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); // val corefBits = getCorefBits(conllDoc.words.map(_.size), convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); // val numZeroesToAddToPartNo = 3 - conllDoc.docPartNo.toString.size; @@ -35,7 +35,7 @@ object ConllDocWriter { } def writeDocWithPredAnnotations(writer: PrintWriter, - conllDoc: ConllDoc, + conllDoc: Document, nerChunks: Seq[Seq[Chunk[String]]], corefClustering: OrderedClusteringBound, wikiChunks: Option[Seq[Seq[Chunk[String]]]] = None) { @@ -45,7 +45,7 @@ object ConllDocWriter { def writeDocWithPredAnnotationsWikiStandoff(writer: PrintWriter, standoffWriter: PrintWriter, - conllDoc: ConllDoc, + conllDoc: Document, nerChunks: Seq[Seq[Chunk[String]]], corefClustering: OrderedClusteringBound, wikiChunks: Seq[Seq[Chunk[String]]]) { @@ -54,7 +54,7 @@ object ConllDocWriter { } def writeIncompleteConllDoc(writer: PrintWriter, - doc: ConllDoc) { + doc: Document) { writeIncompleteConllDocNestedNER(writer, doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees.map(_.constTree), doc.speakers, doc.nerChunks, doc.corefChunks); } @@ -210,7 +210,7 @@ object ConllDocWriter { } } - def 
writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: ConllDoc) { + def writeDocIllinoisColumnFormat(writer: PrintWriter, conllDoc: Document) { writer.println("O\t0\t0\tO\t-X-\t-DOCSTART-\tx\tx\t0"); // B-LOC 0 0 I-NP NNP Portugal x x 0 diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala new file mode 100644 index 0000000..8a2ef9d --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -0,0 +1,34 @@ +package edu.berkeley.nlp.entity + +/** + * Created by matthew on 2/18/15. + */ +trait Document { + def docID : String + def docPartNo : Int + // arrays of words in each sentence including punc + def words : Seq[Seq[String]] + // the gram types of the words + def pos : Seq[Seq[String]] + // parse trees of each sentence + def trees : Seq[DepConstTree] + // I am guessing the type of the chunk eg: ORG-NAM + def nerChunks : Seq[Seq[Chunk[String]]] + // have ranges and identifiers for the unique item that they are referenceing + // appears [start, end) + def corefChunks : Seq[Seq[Chunk[Int]]] + // just use "-" for each in the case that the speaker is unknown + def speakers : Seq[Seq[String]] + + def numSents : Int = -1 + + def uid : (String, Int) = docID -> docPartNo + + def fileName : String + + def printableDocName : String + + def isConversation : Boolean = false + + def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]] +} diff --git a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala index 1fad8ce..2bf9fa3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala +++ b/src/main/java/edu/berkeley/nlp/entity/EntitySystem.scala @@ -153,7 +153,7 @@ object EntitySystem { ConllDocReader.loadRawConllDocsWithSuffix(goldPath, size, goldSuffix)); } else { (ConllDocReader.loadRawConllDocsWithSuffix(path, size, suffix), - new ArrayBuffer[ConllDoc]()); + new ArrayBuffer[Document]()); 
} val goldWikification = new HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]; val assembler = CorefDocAssembler(Driver.lang, Driver.useGoldMentions); diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala new file mode 100644 index 0000000..bcec448 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -0,0 +1,34 @@ +package edu.berkeley.nlp.entity + +/** + * Created by matthew on 2/18/15. + */ +case class WikiDoc (docID : String, + docPartNo : Int, + words : Seq[Seq[String]], + pos : Seq[Seq[String]], + trees: Seq[DepConstTree], + nerChunks : Seq[Seq[Chunk[String]]], + corefChunks : Seq[Seq[Chunk[Int]]], + speakers : Seq[Seq[String]] ) extends Document { + + override val numSents = words.size; + + override def uid = docID -> docPartNo; + + override def fileName = { + if (docID.contains("/")) { + docID.substring(docID.lastIndexOf("/") + 1); + } else { + docID; + } + } + + override def printableDocName = docID + " (part " + docPartNo + ")"; + + override def isConversation = docID.startsWith("bc") || docID.startsWith("wb") + + override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None; + + +} diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala new file mode 100644 index 0000000..0896864 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -0,0 +1,77 @@ +package edu.berkeley.nlp.entity + +import java.io.File + +import edu.berkeley.nlp.entity.lang.{ModCollinsHeadFinder, Language} +import edu.berkeley.nlp.entity.preprocess.SentenceSplitter +import edu.berkeley.nlp.futile.syntax.Tree + +import scala.collection.immutable.HashMap +import scala.collection.mutable.ArrayBuffer +import scala.xml._ + +/** + * Created by matthew on 2/18/15. 
+ */ +class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { + + val betterParses = new HashMap[ArrayBuffer[String], Tree[String]] + + // TODO: betterParsesFile + + val headFinder = lang match { + case Language.ENGLISH => new ModCollinsHeadFinder() + case _ => throw new RuntimeException() + } + + val sentenceSplitter = SentenceSplitter.loadSentenceSplitter("models/sentsplit.txt.gz") + + def readWikiDocs(fileName : String) : Seq[WikiDoc] = { + val referencesFile = fileName.replace("RawTexts", "Problems"); + val refxml = XML.loadFile(referencesFile); + val document = scala.io.Source.fromFile(fileName).mkString + + //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) + val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) + + + + for(reference <- refxml \ "ReferenceInstance") { + val surfaceForm = (reference \ "SurfaceForm")(0).text.trim + val offset = (reference \ "offset")(0).text.trim.toInt + val length = (reference \ "length")(0).text.trim.toInt + val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim + val annotatorId = (reference \ "AnnotatorId")(0).text.trim + val annotation = (reference \ "Annotation")(0).text.trim + + + } + + // docID some unique identifier, filename + // partNo some int cnt + // words an array of sentences + // trees set of parse trees for a given sentence entity.DepConstTree + // nerchunks entity.Chunk + + + Seq[WikiDoc]() + } + +} + +object WikiDocReader { + def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = { + val rawDir = new File(path) + if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { + throw new RuntimeException("Couldn't find directory " + path); + } + var rawFiles = rawDir.listFiles.map(_.getAbsolutePath()) + //val files = rawFiles.filter(file => 
file.getAbsolutePath.endsWith(suffix)); + val reader = new WikiDocReader(lang, betterParsesFile) + val docs = new ArrayBuffer[Document] + for(fname <- rawFiles) { + docs ++= reader.readWikiDocs(fname) + } + docs + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala index bfd8b14..ee9b457 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefConllScorer.scala @@ -9,22 +9,22 @@ import scala.sys.process.stringSeqToProcess import scala.sys.process.Process import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.Driver -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ConllDocWriter class CorefConllScorer(val conllEvalScriptPath: String) { - def renderFinalScore(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = { + def renderFinalScore(conllDocs: Seq[Document], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = { val summary = score(conllDocs, rawPredClusterings, goldClusterings, true); CorefConllScorer.processConllString(summary, false); } - def renderSuffStats(conllDoc: ConllDoc, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = { + def renderSuffStats(conllDoc: Document, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = { val summary = score(Seq(conllDoc), Seq(rawPredClustering), Seq(goldClustering), false); CorefConllScorer.processConllString(summary, true); } - def score(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = { + def score(conllDocs: Seq[Document], rawPredClusterings: 
Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = { val predClusterings = if (Driver.doConllPostprocessing) rawPredClusterings.map(_.postprocessForConll()) else rawPredClusterings; // var predFile = File.createTempFile("temp", ".conll"); val (predFile, goldFile) = if (Driver.conllOutputDir != "" && saveTempFiles) { diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala index f7cc4b6..f5634fb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDoc.scala @@ -10,9 +10,9 @@ import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document -case class CorefDoc(val rawDoc: ConllDoc, +case class CorefDoc(val rawDoc: Document, val goldMentions: Seq[Mention], val goldClustering: OrderedClustering, val predMentions: Seq[Mention]) { diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala index 9c369e3..413e1cd 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssembler.scala @@ -12,17 +12,17 @@ import edu.berkeley.nlp.entity.lang.ChineseCorefLanguagePack import edu.berkeley.nlp.entity.lang.ArabicCorefLanguagePack import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document case class ProtoMention(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIdx: Int); case class ProtoMentionFancy(val sentIdx: Int, val startIdx: Int, val endIdx: Int, val headIndices: 
Seq[Int]); -case class ProtoCorefDoc(val doc: ConllDoc, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]); +case class ProtoCorefDoc(val doc: Document, val goldMentions: Seq[Mention], val predProtoMentions: Seq[ProtoMention]); class CorefDocAssembler(val langPack: CorefLanguagePack, val useGoldMentions: Boolean) { - def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -31,7 +31,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def createCorefDocFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = { + def createCorefDocFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, possibleChunks: Seq[Seq[Chunk[Boolean]]]): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentions(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -41,11 +41,11 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { + def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack); } - def extractPredMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { + def extractPredMentions(rawDoc: Document, 
propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSorted(rawDoc, gms); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -54,7 +54,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - def extractPredMentionsFancy(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = { + def extractPredMentionsFancy(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSortedFancy(rawDoc, gms, possibleChunks); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -63,7 +63,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - private def getProtoMentionsSorted(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = { + private def getProtoMentionsSorted(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMention]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK @@ -131,7 +131,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, // } } - private def getProtoMentionsSortedFancy(rawDoc: ConllDoc, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = { + private def getProtoMentionsSortedFancy(rawDoc: Document, gms: Seq[Mention], possibleChunks: Seq[Seq[Chunk[Boolean]]]): Seq[Seq[ProtoMention]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMention]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NPs and PRPs *except* for those contained in NE chunks (the NE tagger seems more 
reliable than the parser) @@ -154,7 +154,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, } } - private def filterNonMaximalNPs(rawDoc: ConllDoc, mentionExtents: Seq[HashSet[ProtoMention]]) = { + private def filterNonMaximalNPs(rawDoc: Document, mentionExtents: Seq[HashSet[ProtoMention]]) = { val filteredProtoMentionsSorted = (0 until rawDoc.numSents).map(i => new ArrayBuffer[ProtoMention]); for (sentIdx <- 0 until mentionExtents.size) { val protoMentionsByHead = mentionExtents(sentIdx).groupBy(_.headIdx); @@ -211,7 +211,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, ////////////////// - def createCorefDocWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDocWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = extractGoldMentionsWithCoordination(rawDoc, propertyComputer); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); @@ -220,7 +220,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, new CorefDoc(rawDoc, goldMentions, goldClustering, predMentions) } - def extractGoldMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { + def extractGoldMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer): (Seq[Mention], OrderedClustering) = { val goldProtoMentionsSorted = getGoldProtoMentionsSortedWithCoordination(rawDoc); val finalMentions = new ArrayBuffer[Mention](); val goldClusterLabels = new ArrayBuffer[Int](); @@ -238,7 +238,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, (finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels)); } - def extractPredMentionsWithCoordination(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { + def 
extractPredMentionsWithCoordination(rawDoc: Document, propertyComputer: MentionPropertyComputer, gms: Seq[Mention]): Seq[Mention] = { val protoMentionsSorted = getProtoMentionsSortedWithCoordination(rawDoc, gms); val finalMentions = new ArrayBuffer[Mention](); for (sentProtoMents <- protoMentionsSorted; protoMent <- sentProtoMents) { @@ -247,7 +247,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, finalMentions; } - private def getGoldProtoMentionsSortedWithCoordination(rawDoc: ConllDoc): Seq[Seq[ProtoMentionFancy]] = { + private def getGoldProtoMentionsSortedWithCoordination(rawDoc: Document): Seq[Seq[ProtoMentionFancy]] = { val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield { for (chunk <- rawDoc.corefChunks(sentIdx)) yield { val headIndices = rawDoc.trees(sentIdx).getSpanHeadOrNPCoordinatedHeads(chunk.start, chunk.end); @@ -257,7 +257,7 @@ class CorefDocAssembler(val langPack: CorefLanguagePack, goldProtoMentions.map(_.sortBy(ment => (ment.sentIdx, ment.headIndices.head, ment.endIdx, ment.startIdx))); } - private def getProtoMentionsSortedWithCoordination(rawDoc: ConllDoc, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = { + private def getProtoMentionsSortedWithCoordination(rawDoc: Document, gms: Seq[Mention]): Seq[Seq[ProtoMentionFancy]] = { val mentionExtents = (0 until rawDoc.numSents).map(i => new HashSet[ProtoMentionFancy]); for (sentIdx <- 0 until rawDoc.numSents) { // Extract NE spans: filter out O, QUANTITY, CARDINAL, CHUNK @@ -442,7 +442,7 @@ object CorefDocAssembler { new CorefDocAssembler(langPack, useGoldMentions); } - def extractGoldMentions(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = { + def extractGoldMentions(rawDoc: Document, propertyComputer: MentionPropertyComputer, langPack: CorefLanguagePack): (Seq[Mention], OrderedClustering) = { val goldProtoMentionsSorted = getGoldProtoMentionsSorted(rawDoc); val finalMentions = new 
ArrayBuffer[Mention](); val goldClusterLabels = new ArrayBuffer[Int](); @@ -460,7 +460,7 @@ object CorefDocAssembler { (finalMentions, OrderedClustering.createFromClusterIds(goldClusterLabels)); } - def getGoldProtoMentionsSorted(rawDoc: ConllDoc): Seq[Seq[ProtoMention]] = { + def getGoldProtoMentionsSorted(rawDoc: Document): Seq[Seq[ProtoMention]] = { val goldProtoMentions = for (sentIdx <- 0 until rawDoc.corefChunks.size) yield { for (chunk <- rawDoc.corefChunks(sentIdx)) yield { val headIdx = rawDoc.trees(sentIdx).getSpanHead(chunk.start, chunk.end); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala index cacd259..41a80e3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefDocAssemblerACE.scala @@ -5,13 +5,13 @@ import edu.berkeley.nlp.futile.util.Logger import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.wiki.ACEMunger import java.io.File -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document class CorefDocAssemblerACE(dirPath: String) { val langPack = new EnglishCorefLanguagePack() - def createCorefDoc(rawDoc: ConllDoc, propertyComputer: MentionPropertyComputer): CorefDoc = { + def createCorefDoc(rawDoc: Document, propertyComputer: MentionPropertyComputer): CorefDoc = { val (goldMentions, goldClustering) = CorefDocAssembler.extractGoldMentions(rawDoc, propertyComputer, langPack); if (goldMentions.size == 0) { Logger.logss("WARNING: no gold mentions on document " + rawDoc.printableDocName); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala index 208c342..85adc64 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/CorefSystem.scala @@ -39,7 +39,7 @@ import 
edu.berkeley.nlp.entity.xdistrib.DocumentGraphComponents import edu.berkeley.nlp.futile.fig.exec.Execution import edu.berkeley.nlp.entity.Driver import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.WordNetInterfacer import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.entity.ConllDocReader diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala index 58b9cd4..8069292 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala @@ -8,14 +8,14 @@ import edu.berkeley.nlp.entity.sem.SemClasser import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.util.Counter import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.Driver; import edu.berkeley.nlp.entity.WordNetInterfacer // TODO: Extract an interface for ConllDoc so I don't have to keep the whole // document around...but while I'm feature engineering it's useful to be able // to put my hands on anything I want -class Mention(val rawDoc: ConllDoc, +class Mention(val rawDoc: Document, val mentIdx: Int, val sentIdx: Int, val startIdx: Int, @@ -247,7 +247,7 @@ object Mention { val StartPosPlaceholder = ""; val EndPosPlaceholder = ""; - def createMentionComputeProperties(rawDoc: ConllDoc, + def createMentionComputeProperties(rawDoc: Document, mentIdx: Int, sentIdx: Int, startIdx: Int, diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala index 512cc27..a78e96f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDoc.scala @@ -15,10 +15,10 @@ import edu.berkeley.nlp.entity.Driver import 
edu.berkeley.nlp.entity.ner.NerFeaturizer import scala.collection.mutable.HashSet import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ner.NerPruner -class JointDoc(val rawDoc: ConllDoc, +class JointDoc(val rawDoc: Document, val docGraph: DocumentGraph, val goldNERChunks: Seq[Seq[Chunk[String]]], val goldWikiChunks: Seq[Seq[Chunk[String]]]) { @@ -71,7 +71,7 @@ class JointDoc(val rawDoc: ConllDoc, object JointDoc { - def apply(rawDoc: ConllDoc, + def apply(rawDoc: Document, docGraph: DocumentGraph, maybeGoldNERChunks: Option[Seq[Seq[Chunk[String]]]], maybeGoldWikiChunks: Option[Seq[Seq[Chunk[String]]]]) = { @@ -89,7 +89,7 @@ object JointDoc { } def assembleJointDocs(docGraphs: Seq[DocumentGraph], - goldConllDocsForNER: Seq[ConllDoc], + goldConllDocsForNER: Seq[Document], goldWikification: HashMap[String,HashMap[Int,ArrayBuffer[Chunk[String]]]]) = { docGraphs.map(docGraph => { val rawDoc = docGraph.corefDoc.rawDoc; diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala index fc78b5e..85c9683 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointDocACE.scala @@ -5,13 +5,13 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.DocumentGraph import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.entity.wiki._ import edu.berkeley.nlp.futile.util.Logger -class JointDocACE(val rawDoc: ConllDoc, +class JointDocACE(val rawDoc: Document, val docGraph: DocumentGraph, val goldWikiChunks: Seq[Seq[Chunk[Seq[String]]]]) { @@ -36,7 +36,7 @@ class JointDocACE(val rawDoc: ConllDoc, object JointDocACE { - def apply(rawDoc: 
ConllDoc, + def apply(rawDoc: Document, docGraph: DocumentGraph, maybeGoldWikiChunks: Option[Seq[Seq[Chunk[Seq[String]]]]]) = { val goldWikiChunks = if (maybeGoldWikiChunks.isDefined) { diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala index 667672b..afeb3f7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictor.scala @@ -3,7 +3,7 @@ package edu.berkeley.nlp.entity.joint import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.entity.GUtil diff --git a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala index 71e9274..cf93562 100644 --- a/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala +++ b/src/main/java/edu/berkeley/nlp/entity/joint/JointPredictorACE.scala @@ -12,7 +12,7 @@ import edu.berkeley.nlp.entity.coref.CorefDocAssembler import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.Chunk -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.DocumentGraph import edu.berkeley.nlp.futile.fig.exec.Execution import edu.berkeley.nlp.entity.coref.CorefEvaluator diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala index a0f4c96..0627b42 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NEEvaluator.scala @@ -1,6 +1,6 @@ package edu.berkeley.nlp.entity.ner 
-import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.entity.coref.CorefSystem @@ -53,11 +53,11 @@ object NEEvaluator { })); } - def evaluate(goldDocs: Seq[ConllDoc], predDocs: Seq[ConllDoc]) { + def evaluate(goldDocs: Seq[Document], predDocs: Seq[Document]) { evaluateChunks(goldDocs, predDocs.map(_.nerChunks)); } - def evaluateChunks(goldDocs: Seq[ConllDoc], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) { + def evaluateChunks(goldDocs: Seq[Document], allPredChunks: Seq[Seq[Seq[Chunk[String]]]]) { var correct = 0; val correctByLabel = new Counter[String]; var correctSameHead = 0; diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala index fd9cd40..911ba9c 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NESentenceMunger.scala @@ -2,13 +2,13 @@ package edu.berkeley.nlp.entity.ner import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer import edu.berkeley.nlp.futile.fig.basic.IOUtils object NESentenceMunger { - def writeSentences(file: String, docs: Seq[ConllDoc]) { + def writeSentences(file: String, docs: Seq[Document]) { val out = IOUtils.openOutHard(file); for (doc <- docs; words <- doc.words) { out.println(words.foldLeft("")(_ + " " + _).trim); diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala index 1b7a40f..e73e7c2 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerPruner.scala @@ -2,7 +2,7 @@ package edu.berkeley.nlp.entity.ner import 
scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.coref.UID -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.fig.basic.Indexer import edu.berkeley.nlp.entity.Driver @@ -10,14 +10,14 @@ import edu.berkeley.nlp.futile.util.Logger trait NerPruner { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]]; + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]]; } @SerialVersionUID(1L) class NerPrunerFromModel(val nerModel: NerSystemLabeled, val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { val sentMarginals = nerModel.computeLogMarginals(doc.words(sentIdx).toArray, doc.pos(sentIdx).toArray); NerPruner.pruneFromMarginals(sentMarginals, nerModel.labelIndexer, pruningThreshold); } @@ -28,7 +28,7 @@ class NerPrunerFromMarginals(val nerMarginals: HashMap[UID,Seq[Array[Array[Float val neLabelIndexer: Indexer[String], val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { require(nerMarginals.contains(doc.uid), "Doc ID " + doc.uid + " doesn't have precomputed NER marginals" + " and the NER pruner in this model is configured to rely on these. 
You need to either change" + " how you specify the pruner (if training) or use a different model entirely (if testing)"); @@ -42,7 +42,7 @@ class NerPrunerFromMarginalsAndModel(val nerMarginals: HashMap[UID,Seq[Array[Arr val nerModel: NerSystemLabeled, val pruningThreshold: Double) extends NerPruner with Serializable { - def pruneSentence(doc: ConllDoc, sentIdx: Int): Array[Array[String]] = { + def pruneSentence(doc: Document, sentIdx: Int): Array[Array[String]] = { val sentMarginals = if (nerMarginals.contains(doc.uid)) { nerMarginals(doc.uid)(sentIdx) } else { diff --git a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala index 2d1bb7a..7cf1b43 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ner/NerSystemLabeled.scala @@ -2,11 +2,10 @@ package edu.berkeley.nlp.entity.ner import edu.berkeley.nlp.futile.fig.basic.Indexer import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.futile.classify.GeneralLogisticRegression import edu.berkeley.nlp.entity.coref.CorefSystem import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.classify.SequenceExample import edu.berkeley.nlp.futile.fig.basic.IOUtils import java.io.FileInputStream @@ -15,12 +14,9 @@ import java.io.File import java.io.FileOutputStream import java.io.ObjectOutputStream import edu.berkeley.nlp.futile.util.Counter -import edu.berkeley.nlp.entity.Chunk import scala.collection.mutable.HashMap -import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.lang.Language import scala.util.Random -import edu.berkeley.nlp.entity.ConllDocWriter import edu.berkeley.nlp.math.SloppyMath import edu.berkeley.nlp.entity.wiki.WikipediaInterface import 
edu.berkeley.nlp.entity.coref.UID @@ -194,7 +190,8 @@ object NerSystemLabeled { // transitionMatrix.map(_.map(arr => if (arr != null) arr.map(featureIndexer.getIndex(_)) else null)); // } - def replaceNer(doc: ConllDoc, newChunks: Seq[Seq[Chunk[String]]]) = { + def replaceNer(doc: Document, newChunks: Seq[Seq[Chunk[String]]]) = { + // MFL TODO: ?? need to make it work either way? new ConllDoc(doc.docID, doc.docPartNo, doc.words, doc.pos, doc.trees, newChunks, doc.corefChunks, doc.speakers); } @@ -227,7 +224,7 @@ object NerSystemLabeled { // TRAINING - def trainNerSystem(trainDocs: Seq[ConllDoc], + def trainNerSystem(trainDocs: Seq[Document], maybeBrownClusters: Option[Map[String,String]], nerFeatureSet: Set[String], reg: Double, @@ -267,7 +264,7 @@ object NerSystemLabeled { // EVALUATION - def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[ConllDoc]) { + def evaluateNerSystem(nerSystem: NerSystemLabeled, testDocs: Seq[Document]) { val labelIndexer = nerSystem.labelIndexer; Logger.logss("Extracting test examples"); val testExamples = extractNerChunksFromConll(testDocs); @@ -332,7 +329,7 @@ object NerSystemLabeled { } } - def extractNerChunksFromConll(docs: Seq[ConllDoc]): Seq[NerExample] = { + def extractNerChunksFromConll(docs: Seq[Document]): Seq[NerExample] = { val chunkTypeCounts = new Counter[String]; val examples = docs.flatMap(doc => { val chunksToUse = doc.nerChunks diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala index 19ac409..9e8ee9e 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/Reprocessor.scala @@ -1,7 +1,7 @@ package edu.berkeley.nlp.entity.preprocess import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import scala.collection.JavaConverters._ import 
scala.collection.mutable.ArrayBuffer import java.io.PrintWriter diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala index 8ac70d1..85c7a97 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/SentenceSplitter.scala @@ -1,5 +1,5 @@ package edu.berkeley.nlp.entity.preprocess -import edu.berkeley.nlp.entity.ConllDoc +import edu.berkeley.nlp.entity.Document import edu.berkeley.nlp.entity.coref.CorefSystem import scala.io.Source import scala.collection.mutable.ArrayBuffer @@ -99,8 +99,8 @@ object SentenceSplitter { def featurize(featureIndexer: Indexer[String], addToIndexer: Boolean): Array[Int] = { val featStrs = new ArrayBuffer[String]; - val pw = prevWord; - val fw = followingWord; + val pw = if(prevWord.isEmpty) " " else prevWord + val fw = if (followingWord.isEmpty) " " else followingWord val fwcls = (if (Character.isUpperCase(fw.charAt(0))) "UC" else if (Character.isLowerCase(fw.charAt(0))) "LC" else if (!Character.isLetterOrDigit(fw.charAt(0))) "PU" else "OTHER"); featStrs += ("Bias=1"); featStrs += ("LastChar=" + pw.last); @@ -242,7 +242,7 @@ object SentenceSplitter { } - private def readExamplesFromConll(docs: Seq[ConllDoc]): Seq[SplitExample] = { + private def readExamplesFromConll(docs: Seq[Document]): Seq[SplitExample] = { // N.B. we only loop up until size - 1 since the end of the last sentence // has no following context and isn't a good training example. // We extract pretty much all positives except for really weird stuff. 
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 88f9ff2..69fd469 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -5,9 +5,8 @@ import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.ConllDocReader +import edu.berkeley.nlp.entity.{WikiDocReader, ConllDocReader, GUtil} import edu.berkeley.nlp.entity.coref.CorefDocAssembler -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.entity.coref.MentionPropertyComputer import edu.berkeley.nlp.entity.lang.Language @@ -221,11 +220,15 @@ object WikipediaInterface { } else if (WikipediaInterface.mentionType == "ontonotes") { // OntoNotes: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); + } else if (WikipediaInterface.mentionType == "wikipedia") { + WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)) } else { throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType); } }); // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; + + // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. 
val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet; Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { From 57c24808623a96f650507cd7a32732960e38cf94 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 21 Feb 2015 20:53:21 -0800 Subject: [PATCH 02/25] trying to build wiki to conll interface similar to the existing raw text processor --- .gitignore | 3 +- .../berkeley/nlp/entity/WikiDocReader.scala | 6 +- .../preprocess/PreprocessingDriver.java | 6 +- .../entity/preprocess/WikiPreprocessor.scala | 167 ++++++++++++++++++ 4 files changed, 177 insertions(+), 5 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala diff --git a/.gitignore b/.gitignore index d7ce67f..99fe3d9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ expers/ models/ project/project/ project/target/ -target/ \ No newline at end of file +target/ +specify_execDir/ diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 0896864..8865ae4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -34,12 +34,12 @@ class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) - + for(reference <- refxml \ "ReferenceInstance") { val surfaceForm = (reference \ "SurfaceForm")(0).text.trim - val offset = (reference \ "offset")(0).text.trim.toInt - val length = (reference \ "length")(0).text.trim.toInt + val offset = (reference \ "Offset")(0).text.trim.toInt + val 
length = (reference \ "Length")(0).text.trim.toInt val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim val annotatorId = (reference \ "AnnotatorId")(0).text.trim val annotation = (reference \ "Annotation")(0).text.trim diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java index 1d3a0d7..78fba09 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/PreprocessingDriver.java @@ -12,6 +12,7 @@ import edu.berkeley.nlp.PCFGLA.TreeAnnotations; import edu.berkeley.nlp.entity.ConllDocJustWords; import edu.berkeley.nlp.entity.ConllDocReader; +import edu.berkeley.nlp.entity.WikiDocReader; import edu.berkeley.nlp.entity.lang.Language; import edu.berkeley.nlp.entity.ner.NerSystemLabeled; import edu.berkeley.nlp.futile.fig.basic.IOUtils; @@ -92,7 +93,7 @@ public class PreprocessingDriver implements Runnable { public static boolean useAlternateTokenizer = false; public static enum Mode { - RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL; + RAW_TEXT, CONLL_JUST_WORDS, REDO_CONLL, WIKILIMITED; } public static void main(String[] args) { @@ -128,6 +129,9 @@ public void run() { Logger.logss("Processed document " + docName + " and wrote result to " + outputDir); } writer.close(); + } else if (mode == Mode.WIKILIMITED) { + WikiDocReader docReader = new WikiDocReader(Language.ENGLISH, ""); + WikiPreprocessor.processesDocs(inputDir + "/", outputDir + "/", docReader, splitter, parser, backoffParser, nerSystem); } else { ConllDocReader docReader = new ConllDocReader(Language.ENGLISH, ""); for (File inputFile : new File(inputDir).listFiles()) { diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala new file mode 100644 index 0000000..e6ed112 --- /dev/null +++ 
b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -0,0 +1,167 @@ +package edu.berkeley.nlp.entity.preprocess + +import java.io.File + +import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser +import edu.berkeley.nlp.entity.{Chunk, WikiDocReader} +import edu.berkeley.nlp.entity.ner.NerSystemLabeled +import edu.berkeley.nlp.futile.util.Logger +import edu.berkeley.nlp.syntax.Tree + +import scala.xml._ +import scala.concurrent._ +import scala.collection.JavaConverters._ + +import ExecutionContext.Implicits.global + +/** + * Created by matthew on 2/21/15. + */ +object WikiPreprocessor { + + def processesDocs (inputDir : String, outputDir : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) = { + new File(inputDir).listFiles.map(file => { + val input_file = file.getAbsolutePath + val output_file = outputDir + file.getName + //Future { + process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) + //} + })//.foreach(Await.result(_, duration.Duration.Inf)) + } + + def process(inputFile : String, outputFile : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) = { + /*String docName = inputPath; + String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); + String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); + String[] sentences = null; + if (skipSentenceSplitting) { + sentences = canonicalizedParagraphs; + } else { + sentences = splitter.splitSentences(canonicalizedParagraphs); + } + String[][] tokenizedSentences = (useAlternateTokenizer ? 
splitter.tokenizeAlternate(sentences) : splitter.tokenize(sentences)); + Logger.logss("Document " + docName + " contains " + lines.length + " lines and " + tokenizedSentences.length + " sentences"); + String[][] docConllLines = renderDocConllLines(docName, tokenizedSentences, parser, backoffParser, nerSystem); + writeConllLines(docName, docConllLines, outputPath); +*/ + + /* + String[][] conllLines = new String[tokenizedSentences.length][]; + for (int sentIdx = 0; sentIdx < tokenizedSentences.length; sentIdx++) { + String[] tokenizedSentence = tokenizedSentences[sentIdx]; + Tree parse = parse(parser, backoffParser, Arrays.asList(tokenizedSentence)); + if (parse.getYield().size() != tokenizedSentence.length) { + Logger.logss("WARNING: couldn't parse sentence, dropping it: " + Arrays.toString(tokenizedSentence)); + Logger.logss(" (This will be fixed to backing off to an X-bar grammar in a future release)"); + } else { + String[] posTags = new String[tokenizedSentence.length]; + List preterminals = parse.getPreTerminalYield(); + for (int i = 0; i < preterminals.size(); i++) { + posTags[i] = preterminals.get(i); + } + String[] nerBioLabels = null; + if (nerSystem != null) { + nerBioLabels = nerSystem.tagBIO(tokenizedSentence, posTags); + } else { + nerBioLabels = new String[tokenizedSentence.length]; + Arrays.fill(nerBioLabels, "O"); + } + conllLines[sentIdx] = renderSentenceConllLines(docName, 0, tokenizedSentence, posTags, parse, nerBioLabels); + } + } + return conllLines; + + */ + + Logger.logss("starting processing of " + inputFile) + val referencesFile = inputFile.replace("RawTexts", "Problems") + val refxml = XML.loadFile(referencesFile) + val document = scala.io.Source.fromFile(inputFile).mkString.split("\n") + + val references = (refxml \ "ReferenceInstance").map(r => ( + (r \ "SurfaceForm")(0).text.trim, + (r \ "Offset")(0).text.trim.toInt, + (r \ "Length")(0).text.trim.toInt, + (r \ "ChosenAnnotation")(0).text.trim, + (r \ "AnnotatorId")(0).text.trim, + (r \ 
"Annotation")(0).text.trim + )) + + val canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(document, false, false) + val sentences = splitter.splitSentences(canonicalizedParagraphs) + val tokens = SentenceSplitter.tokenize(sentences) + + + val doclenratio = sentences.map(_.size).sum.toFloat / document.map(_.size + 1).sum + def refFinder (ref : (String, Int, Int, String, String, String)) : (Int, Chunk[String]) = { + val d = doclenratio * (ref._2 + ref._3 / 2.0) + var cnt = 0 + val wrds = ref._1.replace(" ", "") + def rank_match(i : Int, j : Int) : Double = { + val res = tokens(i).drop(j).reduce(_+_) + for(q <- 0 until Math.min(wrds.size, res.size)) { + if (res(q) != wrds(q)) + return q.toDouble / wrds.size + } + 1.0 + } + for(i <- 0 to sentences.size) { + cnt += sentences(i).size + if(cnt > d) { + // assume that the reference is in this sentence + var ll = cnt - sentences(i).size + d // estimated place in sentence + var tcnt = 0 + var best_start = 0 + var best_rank = Double.NegativeInfinity + + for(j <- 0 until tokens(i).size) { + val r = rank_match(i,j) * Math.abs(ll - tcnt) // try and make the item close to where it should be + if(r > best_rank) { + best_start = j + best_rank = r + } + tcnt += tokens(i)(j).size + } + var len = 0 + var len_cnt = 0 + for(j <- best_start until tokens(i).size; if len_cnt < wrds.size) { + len_cnt += tokens(i)(j).size + len += 1 + } + return (i, new Chunk(best_start, best_start + len, ref._4)) + } + } + (-1, null) + } + + val refplaces = references.map(refFinder) + + val refsorted = refplaces.foldLeft(Map[Int, List[Chunk[String]]]().withDefaultValue(List()))((m, itm) => { + if(itm._1 != -1) { + m.updated(itm._1, m(itm._1) :+ itm._2) + } else + m + }) + + val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + // ... 
filter out the ones where the parses don't match, idk how that is going to effect + var tps = (tokens zip parses).filter((t) => t._1.length == t._2.getYield.size) + + Logger.logss("done with "+inputFile) + + + + + } + +} From 9b050c6c909bddf5c2a4a924521f09aa594dab5f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 24 Feb 2015 18:16:22 -0800 Subject: [PATCH 03/25] -__- --- .../edu/berkeley/nlp/entity/Document.scala | 4 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 4 +- .../entity/preprocess/WikiPreprocessor.scala | 76 ++++++++++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala index 8a2ef9d..44555df 100644 --- a/src/main/java/edu/berkeley/nlp/entity/Document.scala +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -8,13 +8,13 @@ trait Document { def docPartNo : Int // arrays of words in each sentence including punc def words : Seq[Seq[String]] - // the gram types of the words + // the grammar types of the words def pos : Seq[Seq[String]] // parse trees of each sentence def trees : Seq[DepConstTree] // I am guessing the type of the chunk eg: ORG-NAM def nerChunks : Seq[Seq[Chunk[String]]] - // have ranges and identifiers for the unique item that they are referenceing + // have ranges and identifiers for the unique item that they are referencing // appears [start, end) def corefChunks : Seq[Seq[Chunk[Int]]] // just use "-" for each in the case that the speaker is unknown diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala index bcec448..343703b 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -10,7 +10,8 @@ case class WikiDoc (docID : String, trees: Seq[DepConstTree], nerChunks : Seq[Seq[Chunk[String]]], corefChunks : Seq[Seq[Chunk[Int]]], - speakers : Seq[Seq[String]] ) 
extends Document { + speakers : Seq[Seq[String]], + wikiRefChunks : Seq[Seq[Chunk[String]]] ) extends Document { override val numSents = words.size; @@ -30,5 +31,6 @@ case class WikiDoc (docID : String, override def getCorrespondingNERChunk(sentIdx: Int, headIdx: Int): Option[Chunk[String]] = None; + //override def corefChunks = throw new NotImplementedError() } diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index e6ed112..bf7a7e3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -3,11 +3,14 @@ package edu.berkeley.nlp.entity.preprocess import java.io.File import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser -import edu.berkeley.nlp.entity.{Chunk, WikiDocReader} +import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder +import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.syntax.Tree +import edu.berkeley.nlp.futile.fig.basic.Indexer +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.xml._ import scala.concurrent._ import scala.collection.JavaConverters._ @@ -19,6 +22,8 @@ import ExecutionContext.Implicits.global */ object WikiPreprocessor { + val headFinder = new ModCollinsHeadFinder() + def processesDocs (inputDir : String, outputDir : String, docReader : WikiDocReader, splitter : SentenceSplitter, @@ -40,6 +45,43 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { + val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) + + } + + def wikiToLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Array[String]]() + for(i <- 0 until 
wdoc.words.size) { + // val rend = PreprocessingDriver.renderSentenceConllLines(wdoc.docID, 0, wdoc.words(i), ) + //ret.append("test") + } + ret.toSeq.map(_.toSeq) + } + + def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { + var ret = new Array[String](cr.size) + for(i <- 0 until cr.size) { + var sb = new StringBuilder + for(c <- cr) { + + if(c.start == i) { + sb.append("(") + sb.append(c.label) + } + if(c.end == i + 1) + sb.append(")") + + } + } + ret + } + + def mkWikiDoc(inputFile : String, + docReader : WikiDocReader, + splitter : SentenceSplitter, + parser : CoarseToFineMaxRuleParser, + backoffParser : CoarseToFineMaxRuleParser, + nerSystem : NerSystemLabeled) : WikiDoc = { /*String docName = inputPath; String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); @@ -87,6 +129,8 @@ object WikiPreprocessor { val referencesFile = inputFile.replace("RawTexts", "Problems") val refxml = XML.loadFile(referencesFile) val document = scala.io.Source.fromFile(inputFile).mkString.split("\n") + val refname = (refxml \ "ReferenceFileName")(0).text.trim + val references = (refxml \ "ReferenceInstance").map(r => ( (r \ "SurfaceForm")(0).text.trim, @@ -125,7 +169,7 @@ object WikiPreprocessor { var best_rank = Double.NegativeInfinity for(j <- 0 until tokens(i).size) { - val r = rank_match(i,j) * Math.abs(ll - tcnt) // try and make the item close to where it should be + val r = rank_match(i,j) / Math.abs(ll - tcnt) // try and make the item close to where it should be if(r > best_rank) { best_start = j best_rank = r @@ -155,13 +199,37 @@ object WikiPreprocessor { val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) // ... 
filter out the ones where the parses don't match, idk how that is going to effect - var tps = (tokens zip parses).filter((t) => t._1.length == t._2.getYield.size) + val tps = (tokens, parses, 0 until tokens.size).zipped + .filter((a,b,c) => a.length == b.getYield.size) - Logger.logss("done with "+inputFile) + //val indexer = new Indexer[String]() + val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala }) + val trees = for(i <- 0 until tps._1.size) yield { + val childParentMap = DepConstTree.extractDependencyStructure(tps._2(i), headFinder) + new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap) + } + val wikiDoc = new WikiDoc( + docID=inputFile, + docPartNo=refname.toInt, + words=tps._1.toSeq.map(_.toSeq), + pos=null, // todo + trees=tps._2.toSeq.map(t => { + new DepConstTree(t, ) + }), + nerChunks=null, // todo + corefChunks=tps._3.map(i => { + refsorted(i).map(_.hashCode).asInstanceOf[Seq[Int]] + }).asInstanceOf[Seq[Seq[Int]]], + speakers=null, + wikiRefChunks=tps._3.map(refsorted(_)) + ) + + Logger.logss("done with "+inputFile) + wikiDoc } } From deab92afc07048f417678d7f1bfdc78bf9648b79 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 24 Feb 2015 18:42:28 -0800 Subject: [PATCH 04/25] wiki doc appears to be correctly put together now --- .../entity/preprocess/WikiPreprocessor.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index bf7a7e3..92b475a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -7,7 +7,7 @@ import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} import edu.berkeley.nlp.entity.ner.NerSystemLabeled import 
edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.syntax.Tree +import edu.berkeley.nlp.futile.syntax.Tree import edu.berkeley.nlp.futile.fig.basic.Indexer import scala.collection.mutable.{ArrayBuffer, ListBuffer} @@ -197,12 +197,13 @@ object WikiPreprocessor { m }) - val parses : Array[Tree[String]] = tokens.map(t => PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + val parses : Array[Tree[String]] = tokens.map(t => Reprocessor.convertToFutileTree( + PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))) // ... filter out the ones where the parses don't match, idk how that is going to effect val tps = (tokens, parses, 0 until tokens.size).zipped .filter((a,b,c) => a.length == b.getYield.size) - //val indexer = new Indexer[String]() + val indexer = new Indexer[String]() val pos = tps._2.map(t => { new ArrayBuffer[String] ++ t.getPreTerminalYield.asScala }) @@ -211,19 +212,19 @@ object WikiPreprocessor { new DepConstTree(tps._2(i), pos(i), tps._1(i), childParentMap) } + val empty = tps._1.map(l => (0 until l.length).map(a=>"-")).toSeq + val wikiDoc = new WikiDoc( docID=inputFile, docPartNo=refname.toInt, words=tps._1.toSeq.map(_.toSeq), - pos=null, // todo - trees=tps._2.toSeq.map(t => { - new DepConstTree(t, ) - }), - nerChunks=null, // todo + pos=pos, + trees=trees, + nerChunks=tps._1.map(a=>Seq()), // todo corefChunks=tps._3.map(i => { - refsorted(i).map(_.hashCode).asInstanceOf[Seq[Int]] - }).asInstanceOf[Seq[Seq[Int]]], - speakers=null, + refsorted(i).map(c => new Chunk(c.start, c.end, indexer.getIndex(c.label))) + }), + speakers=empty, // todo? 
wikiRefChunks=tps._3.map(refsorted(_)) ) From 3fdd5698f8101a6f865c6fa968d67b2a26ab8487 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 16:20:31 -0800 Subject: [PATCH 05/25] seems to be generating the correct output --- .../entity/preprocess/WikiPreprocessor.scala | 106 ++++++++++++++---- 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 92b475a..82c75ea 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -9,6 +9,7 @@ import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.futile.syntax.Tree import edu.berkeley.nlp.futile.fig.basic.Indexer +import edu.berkeley.nlp.futile.fig.basic.IOUtils import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.xml._ @@ -46,36 +47,103 @@ object WikiPreprocessor { backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) + val lines = wikiToConllLines(wdoc) + val wlines = wikiToWikiLines(wdoc) + //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) + writeWikiLines(wdoc.docID, lines, outputFile) + writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki")) + } + def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = { + var writer = IOUtils.openOutHard(outputFile) + writer.println("#begin document (" + docID + "); part 000") + lines.foreach(l => { + l.foreach(writer.println(_)) + writer.println + }) + writer.close() } - def wikiToLines(wdoc : WikiDoc) : Seq[Seq[String]] = { - val ret = ListBuffer[Array[String]]() - for(i <- 0 until wdoc.words.size) { - // val rend = 
PreprocessingDriver.renderSentenceConllLines(wdoc.docID, 0, wdoc.words(i), ) - //ret.append("test") + def wikiToConllLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Seq[String]]() + //ret.append("#begin document (" + wdoc.docID + "); part " + wdoc.docPartNo) + for(i <- 0 until wdoc.numSents) { + val parseBits = PreprocessingDriver.computeParseBits(Reprocessor.convertFromFutileTree(wdoc.trees(i).constTree)) + //val nerBits = PreprocessingDriver.computeNerBits(wdoc.nerChunks(i).toArray) + val corefBits = computeBits(wdoc.corefChunks(i), wdoc.words(i).size) + var lines = new ListBuffer[String]() + // conll: [doc name] [part num] [word num] [word] [pos] [parsebit] [6] [7] [8] [speakers] [nerbit] [corefbit] + for(j <- 0 until wdoc.words(i).size) { + lines.append(wdoc.docID + "\t" + + wdoc.docPartNo + "\t" + + j + "\t" + + wdoc.words(i)(j) + "\t" + + wdoc.pos(i)(j) + "\t" + + parseBits(j) + "\t" + + "\t-\t-\t-\t" + + "-\t" + // speakers + "-\t" + // nerbit + corefBits(j) + "\t" // coref bits + ) + } + ret.append(lines.toSeq) } - ret.toSeq.map(_.toSeq) + ret.toSeq } - def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { - var ret = new Array[String](cr.size) - for(i <- 0 until cr.size) { - var sb = new StringBuilder - for(c <- cr) { - - if(c.start == i) { - sb.append("(") - sb.append(c.label) - } - if(c.end == i + 1) - sb.append(")") + def computeBits[T](items : Seq[Chunk[T]], len : Int) : Array[String] = { + var ret = Array.fill(len)(List[String]()) + items.foreach(c => { + if(c.start == c.end -1) { + ret(c.start) = ret(c.start) :+ ("(" + c.label + ")") + } else { + ret(c.start) = ret(c.start) :+ ("(" + c.label) + ret(c.end) = ret(c.end) :+ (c.label + ")") + } + }) + ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) + } + def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + val ret = ListBuffer[Seq[String]]() + for(i <- 0 until wdoc.numSents) { + val lines = new ListBuffer[String]() + for(j <- 0 until wdoc.words(i).size) { + 
var s = "" + wdoc.wikiRefChunks(i).foreach(c => { + if(c.start == j) + s = "(" + c.label + }) + s += "*" + wdoc.wikiRefChunks(i).foreach(c => { + if(c.end == j + 1) + s += ")" + }) + lines.append(s) } + ret.append(lines.toSeq) } - ret + ret.toSeq } +// def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { +// var ret = new Array[String](cr.size) +// for(i <- 0 until cr.size) { +// var sb = new StringBuilder +// for(c <- cr) { +// +// if(c.start == i) { +// sb.append("(") +// sb.append(c.label) +// } +// if(c.end == i + 1) +// sb.append(")") +// +// } +// } +// ret +// } + def mkWikiDoc(inputFile : String, docReader : WikiDocReader, splitter : SentenceSplitter, From 8636e264930fb64fe8225364c04429738773ed44 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 16:26:30 -0800 Subject: [PATCH 06/25] use threads --- Makefile | 3 +++ .../berkeley/nlp/entity/preprocess/WikiPreprocessor.scala | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index eab9e29..28f361a 100644 --- a/Makefile +++ b/Makefile @@ -12,3 +12,6 @@ aceTester: $(TARGET) queryModel: $(TARGET) java -cp $(TARGET) edu.berkeley.nlp.entity.wiki.QueryChooser -wikiDBPath models/wiki-db-ace.ser.gz + +wikiLimited: $(TARGET) + java -cp $(TARGET) edu.berkeley.nlp.entity.preprocess.PreprocessingDriver ++config/base.conf -inputDir ../WikificationACL2011Data/WikipediaSample/RawTextsTrain/ -outputDir /tmp/gggg/raw/ -mode WIKILIMITED diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 82c75ea..624c8d5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -34,10 +34,10 @@ object WikiPreprocessor { new File(inputDir).listFiles.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName - 
//Future { + Future { process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) - //} - })//.foreach(Await.result(_, duration.Duration.Inf)) + } + }).foreach(Await.result(_, duration.Duration.Inf)) } def process(inputFile : String, outputFile : String, From ce8ab3f4a7fb627bdfd600056932093d9f677fac Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Feb 2015 18:51:10 -0800 Subject: [PATCH 07/25] minor bug fixes --- Makefile | 2 +- .../entity/preprocess/WikiPreprocessor.scala | 26 ++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 28f361a..61b103e 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ TARGET = target/scala-2.11/berkeley-entity-assembly-1.jar all: $(TARGET) -$(TARGET): $(wildcard src/**) +$(TARGET): $(wildcard src/**/*) sbt assembly aceTester: $(TARGET) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 624c8d5..d872f7f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -35,7 +35,16 @@ object WikiPreprocessor { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName Future { - process(input_file, output_file, docReader, splitter, parser, backoffParser, nerSystem) + try { + process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) + } catch { + case e : Exception => { + Logger.logss("failed file: "+input_file) + System.err.print(e.toString) + e.printStackTrace(System.err) + null + } + } } }).foreach(Await.result(_, duration.Duration.Inf)) } @@ -98,7 +107,7 @@ object WikiPreprocessor { ret(c.start) = ret(c.start) :+ ("(" + c.label + ")") } else { ret(c.start) = ret(c.start) :+ ("(" + c.label) - ret(c.end) = ret(c.end) :+ (c.label + ")") + ret(c.end - 
1) = ret(c.end - 1) :+ (c.label + ")") } }) ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) @@ -265,8 +274,17 @@ object WikiPreprocessor { m }) - val parses : Array[Tree[String]] = tokens.map(t => Reprocessor.convertToFutileTree( - PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava))) + val parses: Array[Tree[String]] = tokens.map(t => { + //try { + Reprocessor.convertToFutileTree( + PreprocessingDriver.parse(parser, backoffParser, t.toList.asJava)) + /*} catch { + case e : java.lang.NullPointerException => { + null; + } + }*/ + }) + // ... filter out the ones where the parses don't match, idk how that is going to effect val tps = (tokens, parses, 0 until tokens.size).zipped .filter((a,b,c) => a.length == b.getYield.size) From 4346d0382712cd8794347539bd4261b9d6a0b164 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sun, 1 Mar 2015 15:26:18 -0800 Subject: [PATCH 08/25] trying to run the wikipedia interface now --- build.sbt | 1 + resources/Messages_de.properties | 9 + resources/Messages_en.properties | 36 ++ resources/Messages_es.properties | 8 + resources/Messages_fr.properties | 8 + resources/Messages_it.properties | 8 + resources/Messages_pt_BR.properties | 38 ++ resources/interwiki.properties | 392 ++++++++++++++++++ resources/operators.txt | 27 ++ .../berkeley/nlp/entity/ConllDocReader.scala | 15 +- .../berkeley/nlp/entity/WikiDocReader.scala | 35 -- .../entity/preprocess/WikiPreprocessor.scala | 61 +-- .../nlp/entity/wiki/WikipediaInterface.scala | 2 +- 13 files changed, 541 insertions(+), 99 deletions(-) create mode 100644 resources/Messages_de.properties create mode 100644 resources/Messages_en.properties create mode 100644 resources/Messages_es.properties create mode 100644 resources/Messages_fr.properties create mode 100644 resources/Messages_it.properties create mode 100644 resources/Messages_pt_BR.properties create mode 100644 resources/interwiki.properties create mode 100644 resources/operators.txt diff --git 
a/build.sbt b/build.sbt index 91a4b9b..a3fe7b7 100644 --- a/build.sbt +++ b/build.sbt @@ -10,3 +10,4 @@ assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") +unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } diff --git a/resources/Messages_de.properties b/resources/Messages_de.properties new file mode 100644 index 0000000..51b38e9 --- /dev/null +++ b/resources/Messages_de.properties @@ -0,0 +1,9 @@ +wiki.tags.toc.content=Inhaltsverzeichnis +wiki.api.url=http://de.wikipedia.org/w/api.php +wiki.api.category1=Kategorie +wiki.api.image1=Datei +wiki.api.image2=Bild +wiki.api.template1=Vorlage +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_en.properties b/resources/Messages_en.properties new file mode 100644 index 0000000..6b9e2f0 --- /dev/null +++ b/resources/Messages_en.properties @@ -0,0 +1,36 @@ +wiki.tags.toc.content=Contents +wiki.api.url=http://en.wikipedia.org/w/api.php +wiki.api.media1=Media +wiki.api.media2=Media +wiki.api.special1=Special +wiki.api.special2=Special +wiki.api.talk1=Talk +wiki.api.talk2=Talk +wiki.api.user1=User +wiki.api.user2=User +wiki.api.usertalk1=User_talk +wiki.api.usertalk2=User_talk +wiki.api.meta1=Meta +wiki.api.meta2=Meta +wiki.api.metatalk1=Meta_talk +wiki.api.metatalk2=Meta_talk +wiki.api.image1=Image +wiki.api.image2=File +wiki.api.imagetalk1=Image_talk +wiki.api.imagetalk2=File_talk +wiki.api.mediawiki1=MediaWiki +wiki.api.mediawiki2=MediaWiki +wiki.api.mediawikitalk1=MediaWiki_talk +wiki.api.mediawikitalk2=MediaWiki_talk +wiki.api.template1=Template +wiki.api.template2=Template +wiki.api.templatetalk1=Template_talk +wiki.api.templatetalk2=Template_talk +wiki.api.help1=Help +wiki.api.help2=Help +wiki.api.helptalk1=Help_talk +wiki.api.helptalk2=Help_talk +wiki.api.category1=Category +wiki.api.category2=Category +wiki.api.categorytalk1=Category_talk 
+wiki.api.categorytalk2=Category_talk \ No newline at end of file diff --git a/resources/Messages_es.properties b/resources/Messages_es.properties new file mode 100644 index 0000000..bc50428 --- /dev/null +++ b/resources/Messages_es.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Contenido +wiki.api.url=http://es.wikipedia.org/w/api.php +wiki.api.category1=Categor\u00EDa +wiki.api.image1=Imagen +wiki.api.template1=Plantilla +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template diff --git a/resources/Messages_fr.properties b/resources/Messages_fr.properties new file mode 100644 index 0000000..2a76842 --- /dev/null +++ b/resources/Messages_fr.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Sommaire +wiki.api.url=http://fr.wikipedia.org/w/api.php +wiki.api.category1=Cat\u00E9gorie +wiki.api.image1=Image +wiki.api.template1=Mod\u00E8le +wiki.api.category2=Category +wiki.api.image2=Image +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_it.properties b/resources/Messages_it.properties new file mode 100644 index 0000000..97778a3 --- /dev/null +++ b/resources/Messages_it.properties @@ -0,0 +1,8 @@ +wiki.tags.toc.content=Indice +wiki.api.url=http://it.wikipedia.org/w/api.php +wiki.api.category1=Categoria +wiki.api.image1=Immagine +wiki.api.template1=Template +wiki.api.category2=Category +wiki.api.image2=File +wiki.api.template2=Template \ No newline at end of file diff --git a/resources/Messages_pt_BR.properties b/resources/Messages_pt_BR.properties new file mode 100644 index 0000000..e0baaf7 --- /dev/null +++ b/resources/Messages_pt_BR.properties @@ -0,0 +1,38 @@ +#Generated by ResourceBundle Editor (http://eclipse-rbe.sourceforge.net) + +wiki.api.category1 = Categoria +wiki.api.category2 = Categoria +wiki.api.categorytalk1 = Categoria_falar +wiki.api.categorytalk2 = Categoria_falar +wiki.api.help1 = Ajuda +wiki.api.help2 = Ajuda +wiki.api.helptalk1 = Ajuda_falar +wiki.api.helptalk2 = Ajuda_falar 
+wiki.api.image1 = Imagem +wiki.api.image2 = Arquivo +wiki.api.imagetalk1 = Imagem_falar +wiki.api.imagetalk2 = Arquivo_falar +wiki.api.media1 = M\u00EDdia +wiki.api.media2 = M\u00EDdia +wiki.api.mediawiki1 = MediaWiki +wiki.api.mediawiki2 = MediaWiki +wiki.api.mediawikitalk1 = MediaWiki_falar +wiki.api.mediawikitalk2 = MediaWiki_falar +wiki.api.meta1 = Meta +wiki.api.meta2 = Meta +wiki.api.metatalk1 = Meta_falar +wiki.api.metatalk2 = Meta_falar +wiki.api.special1 = Especial +wiki.api.special2 = Especial +wiki.api.talk1 = Falar +wiki.api.talk2 = Falar +wiki.api.template1 = Modelo +wiki.api.template2 = Modelo +wiki.api.templatetalk1 = Modelo_falar +wiki.api.templatetalk2 = Modelo_falar +wiki.api.url = http://br.wikipedia.org/w/api.php +wiki.api.user1 = Usu\u00E1rio +wiki.api.user2 = Usu\u00E1rio +wiki.api.usertalk1 = Usu\u00E1rio_falar +wiki.api.usertalk2 = Usu\u00E1rio_falar +wiki.tags.toc.content = Conte\u00FAdo diff --git a/resources/interwiki.properties b/resources/interwiki.properties new file mode 100644 index 0000000..b312b1e --- /dev/null +++ b/resources/interwiki.properties @@ -0,0 +1,392 @@ +be-x-old=http://be-x-old.wikipedia.org/wiki/${title} +tavi=http://tavi.sourceforge.net/${title} +xh=http://xh.wikipedia.org/wiki/${title} +lasvegaswiki=http://wiki.gmnow.com/index.php/${title} +pmeg=http://www.bertilow.com/pmeg/${title}.php +warpedview=http://www.warpedview.com/index.php/${title} +slashdot=http://slashdot.org/article.pl?sid=${title} +wikimedia=http://wikimediafoundation.org/wiki/${title} +wikia=http://www.wikia.com/wiki/index.php/${title} +wo=http://wo.wikipedia.org/wiki/${title} +jefo=http://www.esperanto-jeunes.org/vikio/index.php?${title} +openfacts=http://openfacts.berlios.de/index.phtml?title=${title} +lqwiki=http://wiki.linuxquestions.org/wiki/${title} +wa=http://wa.wikipedia.org/wiki/${title} +ciscavate=http://ciscavate.org/index.php/${title} +demokraatia=http://wiki.demokraatia.ee/ +efnetpythonwiki=http://purl.net/wiki/python/${title} 
+mediazilla=http://bugzilla.wikipedia.org/${title} +wikiquote=http://en.wikiquote.org/wiki/${title} +jbo=http://jbo.wikipedia.org/wiki/${title} +vo=http://vo.wikipedia.org/wiki/${title} +vi=http://vi.wikipedia.org/wiki/${title} +gamewiki=http://gamewiki.org/wiki/index.php/${title} +hewikisource=http://he.wikisource.org/wiki/${title} +ve=http://ve.wikipedia.org/wiki/${title} +google=http://www.google.com/search?q=${title} +uz=http://uz.wikipedia.org/wiki/${title} +drumcorpswiki=http://www.drumcorpswiki.com/index.php/${title} +nah=http://nah.wikipedia.org/wiki/${title} +ur=http://ur.wikipedia.org/wiki/${title} +jiniwiki=http://www.cdegroot.com/cgi-bin/jini?${title} +uk=http://uk.wikipedia.org/wiki/${title} +ug=http://ug.wikipedia.org/wiki/${title} +osi=reference model=http://wiki.tigma.ee/ +mbtest=http://www.usemod.com/cgi-bin/mbtest.pl?${title} +disinfopedia=http://www.disinfopedia.org/wiki.phtml?title=${title} +ty=http://ty.wikipedia.org/wiki/${title} +squeak=http://minnow.cc.gatech.edu/squeak/${title} +tw=http://tw.wikipedia.org/wiki/${title} +tlh=http://tlh.wikipedia.org/wiki/${title} +tt=http://tt.wikipedia.org/wiki/${title} +ts=http://ts.wikipedia.org/wiki/${title} +tr=http://tr.wikipedia.org/wiki/${title} +scoutpedia=http://www.scoutpedia.info/index.php/${title} +minnan=http://zh-min-nan.wikipedia.org/wiki/${title} +to=http://to.wikipedia.org/wiki/${title} +tn=http://tn.wikipedia.org/wiki/${title} +wikinfo=http://www.wikinfo.org/wiki.php?title=${title} +s23wiki=http://is-root.de/wiki/index.php/${title} +tl=http://tl.wikipedia.org/wiki/${title} +aiwiki=http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?${title} +tk=http://tk.wikipedia.org/wiki/${title} +ti=http://ti.wikipedia.org/wiki/${title} +th=http://th.wikipedia.org/wiki/${title} +tg=http://tg.wikipedia.org/wiki/${title} +fr.fr=http://fr.fr.wikinations.org/${title} +te=http://te.wikipedia.org/wiki/${title} +csb=http://csb.wikipedia.org/wiki/${title} +theopedia=http://www.theopedia.com/${title} 
+ta=http://ta.wikipedia.org/wiki/${title} +acadwiki=http://xarch.tu-graz.ac.at/autocad/wiki/${title} +efnetceewiki=http://purl.net/wiki/c/${title} +phpwiki=http://phpwiki.sourceforge.net/phpwiki/index.php?${title} +tmwiki=http://www.EasyTopicMaps.com/?page=${title} +sw=http://sw.wikipedia.org/wiki/${title} +benefitswiki=http://www.benefitslink.com/cgi-bin/wiki.cgi?${title} +ecxei=http://www.ikso.net/cgi-bin/wiki.pl?${title} +sv=http://sv.wikipedia.org/wiki/${title} +uea=http://www.tejo.org/uea/${title} +su=http://su.wikipedia.org/wiki/${title} +st=http://st.wikipedia.org/wiki/${title} +ss=http://ss.wikipedia.org/wiki/${title} +sr=http://sr.wikipedia.org/wiki/${title} +sq=http://sq.wikipedia.org/wiki/${title} +so=http://so.wikipedia.org/wiki/${title} +sn=http://sn.wikipedia.org/wiki/${title} +sm=http://sm.wikipedia.org/wiki/${title} +sl=http://sl.wikipedia.org/wiki/${title} +sk=http://sk.wikipedia.org/wiki/${title} +cache=http://www.google.com/search?q=cache:${title} +svgwiki=http://www.protocol7.com/svg-wiki/default.asp?${title} +si=http://si.wikipedia.org/wiki/${title} +smikipedia=http://www.smikipedia.org/${title} +simple=http://simple.wikipedia.org/wiki/${title} +sh=http://sh.wikipedia.org/wiki/${title} +sg=http://sg.wikipedia.org/wiki/${title} +gentoo-wiki=http://gentoo-wiki.com/${title} +se=http://se.wikipedia.org/wiki/${title} +webseitzwiki=http://webseitz.fluxent.com/wiki/${title} +sd=http://sd.wikipedia.org/wiki/${title} +sc=http://sc.wikipedia.org/wiki/${title} +jamwiki=http://jamwiki.org/wiki/en/${title} +sa=http://sa.wikipedia.org/wiki/${title} +greencheese=http://www.greencheese.org/${title} +linuxwiki=http://www.linuxwiki.de/${title} +diveintoosx=http://diveintoosx.org/${title} +bridgeswiki=http://c2.com/w2/bridges/${title} +rw=http://rw.wikipedia.org/wiki/${title} +ru=http://ru.wikipedia.org/wiki/${title} +corpknowpedia=http://corpknowpedia.org/wiki/index.php/${title} +echei=http://www.ikso.net/cgi-bin/wiki.pl?${title} 
+ro=http://ro.wikipedia.org/wiki/${title} +rn=http://rn.wikipedia.org/wiki/${title} +rm=http://rm.wikipedia.org/wiki/${title} +wikispecies=http://species.wikipedia.org/wiki/${title} +webdevwikinl=http://www.promo-it.nl/WebDevWiki/index.php?page=${title} +sourceforge=http://sourceforge.net/${title} +pythonwiki=http://www.pythonwiki.de/${title} +roa-rup=http://roa-rup.wikipedia.org/wiki/${title} +tmnet=http://www.technomanifestos.net/?${title} +gmailwiki=http://www.gmailwiki.com/index.php/${title} +plog4u=http://plog4u.org/index.php/${title} +googlegroups=http://groups.google.com/groups?q=${title} +wikiworld=http://WikiWorld.com/wiki/index.php/${title} +qu=http://qu.wikipedia.org/wiki/${title} +consciousness=http://teadvus.inspiral.org/ +eljwiki=http://elj.sourceforge.net/phpwiki/index.php/${title} +lojban=http://www.lojban.org/tiki/tiki-index.php?page=${title} +usej=http://www.tejo.org/usej/${title} +tokipona=http://tokipona.wikipedia.org/wiki/${title} +mathsongswiki=http://SeedWiki.com/page.cfm?wikiid=237&doc=${title} +got=http://got.wikipedia.org/wiki/${title} +shakti=http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/${title} +memoryalpha=http://www.memory-alpha.org/en/index.php/${title} +cliki=http://ww.telent.net/cliki/${title} +pt=http://pt.wikipedia.org/wiki/${title} +fr.ca=http://fr.ca.wikinations.org/${title} +ps=http://ps.wikipedia.org/wiki/${title} +fur=http://fur.wikipedia.org/wiki/${title} +wikicities=http://www.wikicities.com/index.php/${title} +pl=http://pl.wikipedia.org/wiki/${title} +pi=http://pi.wikipedia.org/wiki/${title} +wiktionary=http://en.wiktionary.org/wiki/${title} +turismo=http://www.tejo.org/turismo/${title} +pa=http://pa.wikipedia.org/wiki/${title} +terrorwiki=http://www.liberalsagainstterrorism.com/wiki/index.php/${title} +finalempire=http://final-empire.sourceforge.net/cgi-bin/wiki.pl?${title} +fr.be=http://fr.wikinations.be/${title} +os=http://os.wikipedia.org/wiki/${title} +or=http://or.wikipedia.org/wiki/${title} 
+netvillage=http://www.netbros.com/?${title} +seattlewireless=http://seattlewireless.net/?${title} +om=http://om.wikipedia.org/wiki/${title} +pangalacticorg=http://www.pangalactic.org/Wiki/${title} +seeds=http://www.IslandSeeds.org/wiki/${title} +oc=http://oc.wikipedia.org/wiki/${title} +raec=http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/${title} +ny=http://ny.wikipedia.org/wiki/${title} +nv=http://nv.wikipedia.org/wiki/${title} +foldoc=http://www.foldoc.org/foldoc/foldoc.cgi?${title} +no=http://no.wikipedia.org/wiki/${title} +nn=http://nn.wikipedia.org/wiki/${title} +metawikipedia=http://meta.wikimedia.org/wiki/${title} +wikif1=http://www.wikif1.org/${title} +nl=http://nl.wikipedia.org/wiki/${title} +ypsieyeball=http://sknkwrks.dyndns.org:1957/writewiki/wiki.pl?${title} +ng=http://ng.wikipedia.org/wiki/${title} +purlnet=http://purl.oclc.org/NET/${title} +ne=http://ne.wikipedia.org/wiki/${title} +nb=http://nb.wikipedia.org/wiki/${title} +abbenormal=http://www.ourpla.net/cgi-bin/pikie.cgi?${title} +na=http://na.wikipedia.org/wiki/${title} +docbook=http://docbook.org/wiki/moin.cgi/${title} +fr.org=http://fr.wikinations.org/${title} +my=http://my.wikipedia.org/wiki/${title} +brasilwiki=http://rio.ifi.unizh.ch/brasilienwiki/index.php/${title} +mt=http://mt.wikipedia.org/wiki/${title} +ms=http://ms.wikipedia.org/wiki/${title} +mr=http://mr.wikipedia.org/wiki/${title} +advogato=http://www.advogato.org/${title} +senseislibrary=http://senseis.xmp.net/?${title} +mo=http://mo.wikipedia.org/wiki/${title} +mn=http://mn.wikipedia.org/wiki/${title} +lutherwiki=http://www.lutheranarchives.com/mw/index.php/${title} +ml=http://ml.wikipedia.org/wiki/${title} +mk=http://mk.wikipedia.org/wiki/${title} +mi=http://mi.wikipedia.org/wiki/${title} +jspwiki=http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=${title} +mh=http://mh.wikipedia.org/wiki/${title} +mg=http://mg.wikipedia.org/wiki/${title} +metaweb=http://www.metaweb.com/wiki/wiki.phtml?title=${title} 
+kmwiki=http://www.voght.com/cgi-bin/pywiki?${title} +efnetxmlwiki=http://purl.net/wiki/xml/${title} +tejo=http://www.tejo.org/vikio/${title} +zwiki=http://www.zwiki.org/${title} +lv=http://lv.wikipedia.org/wiki/${title} +lt=http://lt.wikipedia.org/wiki/${title} +lo=http://lo.wikipedia.org/wiki/${title} +foxwiki=http://fox.wikis.com/wc.dll?Wiki~${title} +ln=http://ln.wikipedia.org/wiki/${title} +emacswiki=http://www.emacswiki.org/cgi-bin/wiki.pl?${title} +li=http://li.wikipedia.org/wiki/${title} +bemi=http://bemi.free.fr/vikio/index.php?${title} +lg=http://lg.wikipedia.org/wiki/${title} +wikibooks=http://en.wikibooks.org/wiki/${title} +lb=http://lb.wikipedia.org/wiki/${title} +la=http://la.wikipedia.org/wiki/${title} +creationmatters=http://www.ourpla.net/cgi-bin/wiki.pl?${title} +ky=http://ky.wikipedia.org/wiki/${title} +kw=http://kw.wikipedia.org/wiki/${title} +kv=http://kv.wikipedia.org/wiki/${title} +pikie=http://pikie.darktech.org/cgi/pikie?${title} +evowiki=http://www.evowiki.org/index.php/${title} +ku=http://ku.wikipedia.org/wiki/${title} +ks=http://ks.wikipedia.org/wiki/${title} +kr=http://kr.wikipedia.org/wiki/${title} +haribeau=http://wiki.haribeau.de/cgi-bin/wiki.pl?${title} +ko=http://ko.wikipedia.org/wiki/${title} +kn=http://kn.wikipedia.org/wiki/${title} +km=http://km.wikipedia.org/wiki/${title} +kl=http://kl.wikipedia.org/wiki/${title} +kk=http://kk.wikipedia.org/wiki/${title} +kj=http://kj.wikipedia.org/wiki/${title} +ki=http://ki.wikipedia.org/wiki/${title} +why=http://clublet.com/c/c/why?${title} +kg=http://kg.wikipedia.org/wiki/${title} +ka=http://ka.wikipedia.org/wiki/${title} +mus=http://mus.wikipedia.org/wiki/${title} +hrwiki=http://www.hrwiki.org/index.php/${title} +orgpatterns=http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?${title} +jv=http://jv.wikipedia.org/wiki/${title} +gotamac=http://www.got-a-mac.org/${title} +dolphinwiki=http://www.object-arts.com/wiki/html/Dolphin/${title} +zh-cn=http://zh.wikipedia.org/wiki/${title} 
+visualworks=http://wiki.cs.uiuc.edu/VisualWorks/${title} +iawiki=http://www.IAwiki.net/${title} +freebsdman=http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=${title} +ja=http://ja.wikipedia.org/wiki/${title} +chy=http://chy.wikipedia.org/wiki/${title} +unreal=http://wiki.beyondunreal.com/wiki/${title} +iu=http://iu.wikipedia.org/wiki/${title} +it=http://it.wikipedia.org/wiki/${title} +is=http://is.wikipedia.org/wiki/${title} +chr=http://chr.wikipedia.org/wiki/${title} +usemod=http://www.usemod.com/cgi-bin/wiki.pl?${title} +cmwiki=http://www.ourpla.net/cgi-bin/wiki.pl?${title} +hammondwiki=http://www.dairiki.org/HammondWiki/index.php3?${title} +cho=http://cho.wikipedia.org/wiki/${title} +io=http://io.wikipedia.org/wiki/${title} +personaltelco=http://www.personaltelco.net/index.cgi/${title} +ik=http://ik.wikipedia.org/wiki/${title} +haw=http://haw.wikipedia.org/wiki/${title} +ii=http://ii.wikipedia.org/wiki/${title} +wikisource=http://sources.wikipedia.org/wiki/${title} +lugkr=http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?${title} +ig=http://ig.wikipedia.org/wiki/${title} +zh-cfr=http://zh-min-nan.wikipedia.org/wiki/${title} +ie=http://ie.wikipedia.org/wiki/${title} +id=http://id.wikipedia.org/wiki/${title} +ia=http://ia.wikipedia.org/wiki/${title} +openwiki=http://openwiki.com/?${title} +hz=http://hz.wikipedia.org/wiki/${title} +hy=http://hy.wikipedia.org/wiki/${title} +strikiwiki=http://ch.twi.tudelft.nl/~mostert/striki/teststriki.pl?${title} +hu=http://hu.wikipedia.org/wiki/${title} +herzkinderwiki=http://www.herzkinderinfo.de/Mediawiki/index.php/${title} +ht=http://ht.wikipedia.org/wiki/${title} +hr=http://hr.wikipedia.org/wiki/${title} +webisodes=http://www.webisodes.org/${title} +globalvoices=http://cyber.law.harvard.edu/dyn/globalvoices/wiki/${title} +ho=http://ho.wikipedia.org/wiki/${title} +hi=http://hi.wikipedia.org/wiki/${title} +elibre=http://enciclopedia.us.es/index.php/${title} +alife=http://news.alife.org/wiki/index.php?${title} 
+he=http://he.wikipedia.org/wiki/${title} +ast=http://ast.wikipedia.org/wiki/${title} +ha=http://ha.wikipedia.org/wiki/${title} +revo=http://purl.org/NET/voko/revo/art/${title}.html +arxiv=http://www.arxiv.org/abs/${title} +sockwiki=http://wiki.socklabs.com/${title} +gv=http://gv.wikipedia.org/wiki/${title} +gu=http://gu.wikipedia.org/wiki/${title} +gn=http://gn.wikipedia.org/wiki/${title} +gl=http://gl.wikipedia.org/wiki/${title} +seapig=http://www.seapig.org/${title} +gd=http://gd.wikipedia.org/wiki/${title} +ga=http://ga.wikipedia.org/wiki/${title} +opera7wiki=http://nontroppo.org/wiki/${title} +oeis=http://www.research.att.com/cgi-bin/access.cgi/as/njas/sequences/eisA.cgi?Anum=${title} +moinmoin=http://purl.net/wiki/moin/${title} +fy=http://fy.wikipedia.org/wiki/${title} +gej=http://www.esperanto.de/cgi-bin/aktivikio/wiki.pl?${title} +fr=http://fr.wikipedia.org/wiki/${title} +arc=http://arc.wikipedia.org/wiki/${title} +fo=http://fo.wikipedia.org/wiki/${title} +fj=http://fj.wikipedia.org/wiki/${title} +wikinews=http://en.wikinews.org/wiki/${title} +fi=http://fi.wikipedia.org/wiki/${title} +ff=http://ff.wikipedia.org/wiki/${title} +annotationwiki=http://www.seedwiki.com/page.cfm?wikiid=368&doc=${title} +sep11=http://sep11.wikipedia.org/wiki/${title} +wlug=http://www.wlug.org.nz/${title} +fa=http://fa.wikipedia.org/wiki/${title} +eu=http://eu.wikipedia.org/wiki/${title} +tmbw=http://www.tmbw.net/wiki/index.php/${title} +et=http://et.wikipedia.org/wiki/${title} +scn=http://scn.wikipedia.org/wiki/${title} +es=http://es.wikipedia.org/wiki/${title} +muweb=http://www.dunstable.com/scripts/MuWebWeb?${title} +eo=http://eo.wikipedia.org/wiki/${title} +en=http://en.wikipedia.org/wiki/${title} +dejanews=http://www.deja.com/=dnc/getdoc.xp?AN=${title} +el=http://el.wikipedia.org/wiki/${title} +jargonfile=http://sunir.org/apps/meta.pl?wiki=JargonFile&redirect=${title} +eokulturcentro=http://esperanto.toulouse.free.fr/wakka.php?wiki=${title} 
+ee=http://ee.wikipedia.org/wiki/${title} +tum=http://tum.wikipedia.org/wiki/${title} +plog4u_de=http://plog4u.de/index.php/${title} +dz=http://dz.wikipedia.org/wiki/${title} +dv=http://dv.wikipedia.org/wiki/${title} +kerimwiki=http://wiki.oxus.net/${title} +dk=http://da.wikipedia.org/wiki/${title} +de=http://de.wikipedia.org/wiki/${title} +dwjwiki=http://www.suberic.net/cgi-bin/dwj/wiki.cgi?${title} +da=http://da.wikipedia.org/wiki/${title} +wlwiki=http://winslowslair.supremepixels.net/wiki/index.php/${title} +cy=http://cy.wikipedia.org/wiki/${title} +w=http://en.wikipedia.org/wiki/${title} +cv=http://cv.wikipedia.org/wiki/${title} +cs=http://cs.wikipedia.org/wiki/${title} +cr=http://cr.wikipedia.org/wiki/${title} +q=http://en.wikiquote.org/wiki/${title} +co=http://co.wikipedia.org/wiki/${title} +zh-min-nan=http://zh-min-nan.wikipedia.org/wiki/${title} +n=http://en.wikinews.org/wiki/${title} +m=http://meta.wikimedia.org/wiki/${title} +annotation=http://bayle.stanford.edu/crit/nph-med.cgi/${title} +ch=http://ch.wikipedia.org/wiki/${title} +efnetcppwiki=http://purl.net/wiki/cpp/${title} +ce=http://ce.wikipedia.org/wiki/${title} +c2find=http://c2.com/cgi/wiki?FindPage&value=${title} +b=http://en.wikibooks.org/wiki/${title} +ca=http://ca.wikipedia.org/wiki/${title} +dictionary=http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=${title} +ang=http://ang.wikipedia.org/wiki/${title} +zh-tw=http://zh.wikipedia.org/wiki/${title} +bs=http://bs.wikipedia.org/wiki/${title} +br=http://br.wikipedia.org/wiki/${title} +twiki=http://twiki.org/cgi-bin/view/${title} +bo=http://bo.wikipedia.org/wiki/${title} +wikt=http://en.wiktionary.org/wiki/${title} +bn=http://bn.wikipedia.org/wiki/${title} +bm=http://bm.wikipedia.org/wiki/${title} +bi=http://bi.wikipedia.org/wiki/${title} +bh=http://bh.wikipedia.org/wiki/${title} +bg=http://bg.wikipedia.org/wiki/${title} +knowhow=http://www2.iro.umontreal.ca/~paquetse/cgi-bin/wiki.cgi?${title} 
+be=http://be.wikipedia.org/wiki/${title} +wiki=http://c2.com/cgi/wiki?${title} +patwiki=http://gauss.ffii.org/${title} +ba=http://ba.wikipedia.org/wiki/${title} +rfc=http://www.rfc-editor.org/rfc/rfc${title}.txt +zu=http://zu.wikipedia.org/wiki/${title} +lanifexwiki=http://opt.lanifex.com/cgi-bin/wiki.pl?${title} +twistedwiki=http://purl.net/wiki/twisted/${title} +az=http://az.wikipedia.org/wiki/${title} +ay=http://ay.wikipedia.org/wiki/${title} +commons=http://commons.wikimedia.org/wiki/${title} +acronym=http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=${title} +av=http://av.wikipedia.org/wiki/${title} +aspienetwiki=http://aspie.mela.de/Wiki/index.php?title=${title} +as=http://as.wikipedia.org/wiki/${title} +metawiki=http://sunir.org/apps/meta.pl?${title} +ar=http://ar.wikipedia.org/wiki/${title} +zh=http://zh.wikipedia.org/wiki/${title} +pywiki=http://www.voght.com/cgi-bin/pywiki?${title} +an=http://an.wikipedia.org/wiki/${title} +am=http://am.wikipedia.org/wiki/${title} +ak=http://ak.wikipedia.org/wiki/${title} +infosecpedia=http://www.infosecpedia.org/pedia/index.php/${title} +za=http://za.wikipedia.org/wiki/${title} +af=http://af.wikipedia.org/wiki/${title} +firstwiki=http://firstwiki.org/index.php/${title} +als=http://als.wikipedia.org/wiki/${title} +ab=http://ab.wikipedia.org/wiki/${title} +aa=http://aa.wikipedia.org/wiki/${title} +ursine=http://ursine.ca/${title} +meatball=http://www.usemod.com/cgi-bin/mb.pl?${title} +mozillawiki=http://wiki.mozilla.org/index.php/${title} +imdb=http://us.imdb.com/Title?${title} +pythoninfo=http://www.python.org/cgi-bin/moinmoin/${title} +yo=http://yo.wikipedia.org/wiki/${title} +seattlewiki=http://seattlewiki.org/wiki/${title} +yi=http://yi.wikipedia.org/wiki/${title} +vls=http://vls.wikipedia.org/wiki/${title} +meta=http://meta.wikimedia.org/wiki/${title} +susning=http://www.susning.nu/${title} +nds=http://nds.wikipedia.org/wiki/${title} +wikitravel=http://wikitravel.org/en/${title} 
+codersbase=http://www.codersbase.com/${title} +tpi=http://tpi.wikipedia.org/wiki/${title} +ppr=http://c2.com/cgi/wiki?${title} \ No newline at end of file diff --git a/resources/operators.txt b/resources/operators.txt new file mode 100644 index 0000000..7d9835d --- /dev/null +++ b/resources/operators.txt @@ -0,0 +1,27 @@ +pre,-,PreMinus,4600 +pre,+,PrePlus,4600 +pre,not,Not,4600 +# +in,^,Pow,3700 +# +in,*,Times,3800 +in,/,Divide,3800 +in,div,Divide,3800 +in,mod,Mod,3800 +# +in,+,Plus,2900 +in,-,Subtract,2900 +# +in,round,Round,2800 +# +in,=,Equal,2600 +in,!=,Unequal,2600 +in,<>,Unequal,2600 +in,>,Greater,2600 +in,>=,GreaterEqual,2600 +in,<,Less,2600 +in,<=,LessEqual,2600 +# +in,and,And,2000 +# +in,or,Or,1900 diff --git a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala index 9847abd..299fe02 100644 --- a/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/ConllDocReader.scala @@ -296,9 +296,18 @@ object ConllDocReader { var docCounter = 0; var fileIdx = 0; while (fileIdx < files.size && (size == -1 || docCounter < size)) { - val newDocs = reader.readConllDocs(files(fileIdx).getAbsolutePath); - docs ++= newDocs; - docCounter += newDocs.size + val pp = files(fileIdx).getAbsolutePath + try { + Logger.logss("Loading doc: " + pp) + val newDocs = reader.readConllDocs(pp); + docs ++= newDocs; + docCounter += newDocs.size + } catch { + case e : Exception => { + Logger.logss("failed document "+pp) + e.printStackTrace(System.err) + } + } fileIdx += 1; } val numDocs = if (size == -1) docs.size else Math.min(size, files.size); diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 8865ae4..72f5e05 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -15,43 +15,8 @@ import scala.xml._ */ class 
WikiDocReader (val lang : Language, val betterParsesFile : String = "") { - val betterParses = new HashMap[ArrayBuffer[String], Tree[String]] - - // TODO: betterParsesFile - - val headFinder = lang match { - case Language.ENGLISH => new ModCollinsHeadFinder() - case _ => throw new RuntimeException() - } - - val sentenceSplitter = SentenceSplitter.loadSentenceSplitter("models/sentsplit.txt.gz") - def readWikiDocs(fileName : String) : Seq[WikiDoc] = { - val referencesFile = fileName.replace("RawTexts", "Problems"); - val refxml = XML.loadFile(referencesFile); - val document = scala.io.Source.fromFile(fileName).mkString - - //val splits = sentenceSplitter.formCanonicalizedParagraphs(document.split(" "), false, false) - val splits = sentenceSplitter.splitSentences(document.split("\n").filter(!_.trim.isEmpty)) - - - - for(reference <- refxml \ "ReferenceInstance") { - val surfaceForm = (reference \ "SurfaceForm")(0).text.trim - val offset = (reference \ "Offset")(0).text.trim.toInt - val length = (reference \ "Length")(0).text.trim.toInt - val chosenAnnotation = (reference \ "ChosenAnnotation")(0).text.trim - val annotatorId = (reference \ "AnnotatorId")(0).text.trim - val annotation = (reference \ "Annotation")(0).text.trim - - - } - // docID some unique identifier, filename - // partNo some int cnt - // words an array of sentences - // trees set of parse trees for a given sentence entity.DepConstTree - // nerchunks entity.Chunk Seq[WikiDoc]() diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index d872f7f..3b501c1 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -91,7 +91,7 @@ object WikiPreprocessor { parseBits(j) + "\t" + "\t-\t-\t-\t" + "-\t" + // speakers - "-\t" + // nerbit + "*\t" + // nerbit corefBits(j) + "\t" // coref bits ) } @@ -135,23 
+135,6 @@ object WikiPreprocessor { ret.toSeq } -// def computeCorefBits[T](cr : Seq[Chunk[T]]) : Array[String] = { -// var ret = new Array[String](cr.size) -// for(i <- 0 until cr.size) { -// var sb = new StringBuilder -// for(c <- cr) { -// -// if(c.start == i) { -// sb.append("(") -// sb.append(c.label) -// } -// if(c.end == i + 1) -// sb.append(")") -// -// } -// } -// ret -// } def mkWikiDoc(inputFile : String, docReader : WikiDocReader, @@ -159,48 +142,6 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) : WikiDoc = { - /*String docName = inputPath; - String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); - String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); - String[] sentences = null; - if (skipSentenceSplitting) { - sentences = canonicalizedParagraphs; - } else { - sentences = splitter.splitSentences(canonicalizedParagraphs); - } - String[][] tokenizedSentences = (useAlternateTokenizer ? 
splitter.tokenizeAlternate(sentences) : splitter.tokenize(sentences)); - Logger.logss("Document " + docName + " contains " + lines.length + " lines and " + tokenizedSentences.length + " sentences"); - String[][] docConllLines = renderDocConllLines(docName, tokenizedSentences, parser, backoffParser, nerSystem); - writeConllLines(docName, docConllLines, outputPath); -*/ - - /* - String[][] conllLines = new String[tokenizedSentences.length][]; - for (int sentIdx = 0; sentIdx < tokenizedSentences.length; sentIdx++) { - String[] tokenizedSentence = tokenizedSentences[sentIdx]; - Tree parse = parse(parser, backoffParser, Arrays.asList(tokenizedSentence)); - if (parse.getYield().size() != tokenizedSentence.length) { - Logger.logss("WARNING: couldn't parse sentence, dropping it: " + Arrays.toString(tokenizedSentence)); - Logger.logss(" (This will be fixed to backing off to an X-bar grammar in a future release)"); - } else { - String[] posTags = new String[tokenizedSentence.length]; - List preterminals = parse.getPreTerminalYield(); - for (int i = 0; i < preterminals.size(); i++) { - posTags[i] = preterminals.get(i); - } - String[] nerBioLabels = null; - if (nerSystem != null) { - nerBioLabels = nerSystem.tagBIO(tokenizedSentence, posTags); - } else { - nerBioLabels = new String[tokenizedSentence.length]; - Arrays.fill(nerBioLabels, "O"); - } - conllLines[sentIdx] = renderSentenceConllLines(docName, 0, tokenizedSentence, posTags, parse, nerBioLabels); - } - } - return conllLines; - - */ Logger.logss("starting processing of " + inputFile) val referencesFile = inputFile.replace("RawTexts", "Problems") diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 69fd469..00f4c26 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -26,7 +26,7 @@ import 
edu.berkeley.nlp.entity.wiki._ * java -cp /path/to/jar -Xmx8g edu.berkeley.nlp.entity.wiki.WikipediaInterface \ * -datasetPaths path/to/test-docs-directory-one-doc-per-file,path/to/additional/docs,... \ * -wikipediaDumpPath path/to/enwiki-latest-pages-articles.xml - * -outputDir path/to/output-file.ser.gz + * -outputPath path/to/output-file.ser.gz * * Required arguments: * -datasetPaths: pointer to CoNLL-formatted files whose mentions we should extract From 419e35c0e75ada31a5ea3e49b50d85a10348357b Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 3 Mar 2015 12:03:50 -0800 Subject: [PATCH 09/25] changes to simply seralize the wiki documents --- .../berkeley/nlp/entity/DepConstTree.scala | 3 +- .../edu/berkeley/nlp/entity/WikiDoc.scala | 1 + .../berkeley/nlp/entity/WikiDocReader.scala | 36 +++++-------------- .../entity/preprocess/WikiPreprocessor.scala | 13 ++++--- 4 files changed, 21 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala index 641cd4c..31a0d06 100644 --- a/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala +++ b/src/main/java/edu/berkeley/nlp/entity/DepConstTree.scala @@ -16,10 +16,11 @@ import java.util.Collections import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder import edu.berkeley.nlp.futile.ling.CollinsHeadFinder +@SerialVersionUID(1L) class DepConstTree(val constTree: Tree[String], val pos: Seq[String], val words: Seq[String], - val childParentDepMap: HashMap[Int,Int]) { + val childParentDepMap: HashMap[Int,Int]) extends Serializable { require(childParentDepMap.keys.toSeq.sorted.sameElements((0 until words.size)), PennTreeRenderer.render(constTree)); def size = words.size; diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala index 343703b..fc1ab62 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/WikiDoc.scala @@ -3,6 +3,7 @@ package edu.berkeley.nlp.entity /** * Created by matthew on 2/18/15. */ +@SerialVersionUID(1L) case class WikiDoc (docID : String, docPartNo : Int, words : Seq[Seq[String]], diff --git a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala index 72f5e05..2c2f6d8 100644 --- a/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala +++ b/src/main/java/edu/berkeley/nlp/entity/WikiDocReader.scala @@ -2,41 +2,23 @@ package edu.berkeley.nlp.entity import java.io.File -import edu.berkeley.nlp.entity.lang.{ModCollinsHeadFinder, Language} -import edu.berkeley.nlp.entity.preprocess.SentenceSplitter -import edu.berkeley.nlp.futile.syntax.Tree +import edu.berkeley.nlp.entity.lang.Language -import scala.collection.immutable.HashMap import scala.collection.mutable.ArrayBuffer -import scala.xml._ /** * Created by matthew on 2/18/15. */ -class WikiDocReader (val lang : Language, val betterParsesFile : String = "") { - - def readWikiDocs(fileName : String) : Seq[WikiDoc] = { - - - - Seq[WikiDoc]() - } - -} +class WikiDocReader (lang : Language, better : String) {} // TODO: remove object WikiDocReader { def loadRawWikiDocs(path : String, size : Int, suffix : String, lang : Language = Language.ENGLISH, betterParsesFile : String = "") : Seq[Document] = { - val rawDir = new File(path) - if (!rawDir.exists() || !rawDir.canRead() || rawDir.listFiles == null || rawDir.listFiles.isEmpty) { - throw new RuntimeException("Couldn't find directory " + path); - } - var rawFiles = rawDir.listFiles.map(_.getAbsolutePath()) - //val files = rawFiles.filter(file => file.getAbsolutePath.endsWith(suffix)); - val reader = new WikiDocReader(lang, betterParsesFile) - val docs = new ArrayBuffer[Document] - for(fname <- rawFiles) { - docs ++= reader.readWikiDocs(fname) - } - docs + + var docs = GUtil.load(path).asInstanceOf[List[WikiDoc]] + + if(size != -1 && docs.size > 
size) + docs.map(_.asInstanceOf[Document]).slice(0, size).toSeq + else + docs.map(_.asInstanceOf[Document]).toSeq } } \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 3b501c1..362d4e1 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -4,7 +4,7 @@ import java.io.File import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser import edu.berkeley.nlp.entity.lang.ModCollinsHeadFinder -import edu.berkeley.nlp.entity.{DepConstTree, WikiDoc, Chunk, WikiDocReader} +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.entity.ner.NerSystemLabeled import edu.berkeley.nlp.futile.util.Logger import edu.berkeley.nlp.futile.syntax.Tree @@ -31,7 +31,7 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - new File(inputDir).listFiles.map(file => { + val wikiDocs = new File(inputDir).listFiles.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName Future { @@ -46,7 +46,11 @@ object WikiPreprocessor { } } } - }).foreach(Await.result(_, duration.Duration.Inf)) + }).map(f => { + Await.result(f, duration.Duration.Inf) + f.value.get.get + }).filter(_ != null).toList + GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz") } def process(inputFile : String, outputFile : String, @@ -54,13 +58,14 @@ object WikiPreprocessor { splitter : SentenceSplitter, parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, - nerSystem : NerSystemLabeled) = { + nerSystem : NerSystemLabeled) : WikiDoc = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) val lines = wikiToConllLines(wdoc) val wlines = wikiToWikiLines(wdoc) 
//PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) writeWikiLines(wdoc.docID, lines, outputFile) writeWikiLines(wdoc.docID, wlines, outputFile.replace("raw", "wiki")) + wdoc } def writeWikiLines(docID : String, lines : Seq[Seq[String]], outputFile : String) = { From e5b69f1644530ae337f0993f09d3be7c5f8014d1 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 7 Mar 2015 20:40:33 -0800 Subject: [PATCH 10/25] hopefully fix some bugs --- .../entity/preprocess/WikiPreprocessor.scala | 92 +++++++++++-------- .../wiki/JointQueryDenotationChooser.scala | 24 +++-- .../nlp/entity/wiki/WikipediaInterface.scala | 33 +++++-- .../nlp/entity/wiki/WikipediaLinkDB.scala | 2 +- 4 files changed, 98 insertions(+), 53 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index 362d4e1..a63f48b 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -31,24 +31,19 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - val wikiDocs = new File(inputDir).listFiles.map(file => { + val wikiDocs = new File(inputDir).listFiles/*.par*/.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName - Future { - try { - process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) - } catch { - case e : Exception => { - Logger.logss("failed file: "+input_file) - System.err.print(e.toString) - e.printStackTrace(System.err) - null - } + try { + process(input_file, output_file, docReader, splitter, parser.newInstance, backoffParser.newInstance, nerSystem) + } catch { + case e : Exception => { + Logger.logss("failed file: "+input_file) + 
System.err.print(e.toString) + e.printStackTrace(System.err) + null } } - }).map(f => { - Await.result(f, duration.Duration.Inf) - f.value.get.get }).filter(_ != null).toList GUtil.save(wikiDocs.asInstanceOf[Serializable], outputDir + "wiki-docs.doc.ser.gz") } @@ -61,6 +56,7 @@ object WikiPreprocessor { nerSystem : NerSystemLabeled) : WikiDoc = { val wdoc = mkWikiDoc(inputFile, docReader, splitter, parser, backoffParser, nerSystem) val lines = wikiToConllLines(wdoc) + //val wlines = wiki.WikiAnnotReaderWriter.getWikiBits(wdoc.words.map(_.size), wdoc.wikiRefChunks) val wlines = wikiToWikiLines(wdoc) //PreprocessingDriver.writeConllLines(wdoc.docID, lines.map(_.toArray).toArray, outputFile) writeWikiLines(wdoc.docID, lines, outputFile) @@ -118,7 +114,8 @@ object WikiPreprocessor { ret.map(i => {if(i.isEmpty) "-" else i.reduce(_+"|"+_)}) } - def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + /*def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + // this does not handle multiple chunks on the same span well, but that shouldn't be an issue, since wiki docs shouldn't have that val ret = ListBuffer[Seq[String]]() for(i <- 0 until wdoc.numSents) { val lines = new ListBuffer[String]() @@ -138,6 +135,21 @@ object WikiPreprocessor { ret.append(lines.toSeq) } ret.toSeq + }*/ + + def wikiToWikiLines(wdoc : WikiDoc) : Seq[Seq[String]] = { + for (sentIdx <- 0 until wdoc.words.size) yield { + for (tokenIdx <- 0 until wdoc.words(sentIdx).size) yield { + val chunksStartingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.start == tokenIdx).sortBy(- _.end); + val numChunksEndingHere = wdoc.wikiRefChunks(sentIdx).filter(chunk => chunk.end - 1 == tokenIdx).size; + var str = if(chunksStartingHere.isEmpty) "" else { + chunksStartingHere.map("("+_.label.replace("(", "-LRB-").replace(")", "-RRB-").replace("*", "-STAR-")).reduce(_+"|"+_) + } + str += "*"; + str += ")" * numChunksEndingHere + str; + } + } } @@ -174,6 +186,10 @@ object WikiPreprocessor { val d = 
doclenratio * (ref._2 + ref._3 / 2.0) var cnt = 0 val wrds = ref._1.replace(" ", "") + + if(wrds.isEmpty) // wtf, how does not create an empty citation??? + return (-1, null) + def rank_match(i : Int, j : Int) : Double = { val res = tokens(i).drop(j).reduce(_+_) for(q <- 0 until Math.min(wrds.size, res.size)) { @@ -182,33 +198,31 @@ object WikiPreprocessor { } 1.0 } - for(i <- 0 to sentences.size) { - cnt += sentences(i).size - if(cnt > d) { - // assume that the reference is in this sentence - var ll = cnt - sentences(i).size + d // estimated place in sentence - var tcnt = 0 - var best_start = 0 - var best_rank = Double.NegativeInfinity - - for(j <- 0 until tokens(i).size) { - val r = rank_match(i,j) / Math.abs(ll - tcnt) // try and make the item close to where it should be - if(r > best_rank) { - best_start = j - best_rank = r - } - tcnt += tokens(i)(j).size + var best_start = 0 + var best_rank = Double.NegativeInfinity + var best_sentence = 0 + for(i <- 0 until sentences.size) { + var tcnt = 0 + for(j <- 0 until tokens(i).size) { + val r = rank_match(i, j) / Math.log(Math.abs(d - cnt - tcnt) + 2) // little to simple, but works in most cases + if(r > best_rank) { + best_rank = r + best_start = j + best_sentence = i } - var len = 0 - var len_cnt = 0 - for(j <- best_start until tokens(i).size; if len_cnt < wrds.size) { - len_cnt += tokens(i)(j).size - len += 1 - } - return (i, new Chunk(best_start, best_start + len, ref._4)) + tcnt += tokens(i)(j).size + 1 // +1 to match the space } + cnt += sentences(i).size + } + var len = 0 + var len_cnt = 0 + for(j <- best_start until tokens(best_sentence).size; if len_cnt < wrds.size) { + len_cnt += tokens(best_sentence)(j).size + len += 1 } - (-1, null) + if(len == 0) + return (-1, null) + (best_sentence, new Chunk(best_start, best_start + len, ref._4)) } val refplaces = references.map(refFinder) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala 
b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 4d03771..9da4d77 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -3,16 +3,14 @@ package edu.berkeley.nlp.entity.wiki import edu.berkeley.nlp.entity.lang.Language import edu.berkeley.nlp.futile.LightRunner import edu.berkeley.nlp.entity.coref.CorefDocAssembler -import edu.berkeley.nlp.entity.ConllDocReader +import edu.berkeley.nlp.entity._ import edu.berkeley.nlp.entity.coref.MentionPropertyComputer -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.fig.basic.Indexer import edu.berkeley.nlp.entity.joint.LikelihoodAndGradientComputer import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.entity.coref.CorefDoc import edu.berkeley.nlp.futile.math.SloppyMath import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.Chunk import edu.berkeley.nlp.entity.joint.GeneralTrainer /** @@ -185,7 +183,7 @@ object JointQueryDenotationChooser { val trainDataPath = "data/ace05/train"; val testDataPath = "data/ace05/dev"; - val wikiPath = "data/ace05/ace05-all-conll-wiki" + val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both items val wikiDBPath = "models/wiki-db-ace.ser.gz" val lambda = 1e-8F @@ -199,8 +197,22 @@ object JointQueryDenotationChooser { LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args) // Read in CoNLL documents val assembler = CorefDocAssembler(Language.ENGLISH, true); - val trainDocs = ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH); - val trainCorefDocs = trainDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + val trainDocs = if(trainDataPath.startsWith("wikiser:")) { + WikiDocReader.loadRawWikiDocs(trainDataPath.split(":")(1), -1, "", Language.ENGLISH) + } else { + 
ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH) + }; + val trainCorefDocs = trainDocs.map(doc => { + try { + assembler.createCorefDoc(doc, new MentionPropertyComputer(None)) + } catch { + case e : Exception => { + // TODO: fix the wikidocument parser + println("failed document "+doc.docID) + null + } + } + }).filter(_!=null); // Read in gold Wikification labels val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 00f4c26..ebeb0d3 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -210,22 +210,41 @@ object WikipediaInterface { val mentionPropertyComputer = new MentionPropertyComputer(None); val pmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = false); val gmAssembler = CorefDocAssembler(Language.ENGLISH, useGoldMentions = true); - val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path => { - if (WikipediaInterface.mentionType == "old") { + val corefDocs = WikipediaInterface.datasetPaths.split(",").flatMap(path_ => { + var path = path_ + val mentionType = if(path.contains(":")) { + val s = path.split(":") + path = s(1) + s(0) + } else { + WikipediaInterface.mentionType + } + Logger.logss("Loading documents "+mentionType+" "+path) + if (mentionType == "old") { // Wikification dataset: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "ace") { + } else if (mentionType == "ace") { // ACE: Use gold mentions here ConllDocReader.loadRawConllDocsWithSuffix(path, -1, "", Language.ENGLISH).map(doc => gmAssembler.createCorefDoc(doc, 
mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "ontonotes") { + } else if (mentionType == "ontonotes") { // OntoNotes: use only auto_conll and pred mentions ConllDocReader.loadRawConllDocsWithSuffix(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)); - } else if (WikipediaInterface.mentionType == "wikipedia") { - WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => pmAssembler.createCorefDoc(doc, mentionPropertyComputer)) + } else if (mentionType == "wikiser") { + WikiDocReader.loadRawWikiDocs(path, -1, docSuffix, Language.ENGLISH).map(doc => { + try { + gmAssembler.createCorefDoc(doc, mentionPropertyComputer) + } catch { + case e : Exception => { + // there are currently about 30 documents that are having an issue with their references + println("FAIL DOCUMENT: "+doc.docID) + null + } + } + }) } else { throw new RuntimeException("Unrecognized mention type: " + WikipediaInterface.mentionType); } - }); + }).filter(_!=null); // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. 
diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index cdcb894..f2f1f6a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -124,7 +124,7 @@ object WikipediaLinkDB { } } } - val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); + val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); // TODO: WTF: inlinksmap is never written to val outLinksMapArrs = outLinksMap.map(entry => entry._1 -> entry._2.toArray); val sizes = Array.tabulate(10)(i => 0); for (key <- outLinksMapArrs.keySet) { From 5535e73811cdf525ebc1bb8c97850e9681b371ae Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 12 Mar 2015 19:33:28 -0700 Subject: [PATCH 11/25] adding some comments --- .../java/edu/berkeley/nlp/entity/GUtil.scala | 2 +- .../entity/preprocess/WikiPreprocessor.scala | 2 +- .../wiki/JointQueryDenotationChooser.scala | 68 ++++++++++++++----- .../entity/wiki/WikificationEvaluator.scala | 27 ++++++++ 4 files changed, 81 insertions(+), 18 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala index 803cd6d..8031560 100644 --- a/src/main/java/edu/berkeley/nlp/entity/GUtil.scala +++ b/src/main/java/edu/berkeley/nlp/entity/GUtil.scala @@ -406,7 +406,7 @@ object GUtil { def argMaxIdxFloat(values: Seq[Float]) = { var currIdx = 0; var maxIdx = 0; - var maxVal = Double.NegativeInfinity; + var maxVal = Float.NegativeInfinity; while (currIdx < values.size) { if (values(currIdx) > maxVal) { maxIdx = currIdx; diff --git a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala index a63f48b..21af271 100644 --- a/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/preprocess/WikiPreprocessor.scala @@ -31,7 +31,7 @@ object WikiPreprocessor { parser : CoarseToFineMaxRuleParser, backoffParser : CoarseToFineMaxRuleParser, nerSystem : NerSystemLabeled) = { - val wikiDocs = new File(inputDir).listFiles/*.par*/.map(file => { + val wikiDocs = new File(inputDir).listFiles.par.map(file => { val input_file = file.getAbsolutePath val output_file = outputDir + file.getName try { diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 9da4d77..23c025a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -53,13 +53,17 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, */ def getUnnormalizedJointScores(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Array[Float]] = { featurizeUseCache(ex, false) + // each example will have a number of features associated with each query + // each feature is an indicator, so we use the cache of the features indexes + // and sum the values of the features val rawQueryScores = ex.cachedFeatsEachQuery.map(feats => GUtil.scoreIndexedFeats(feats, weights)); + // these are the weights from each query wrt the various word choices val queryDenotationMatrix = ex.cachedFeatsEachQueryDenotation.map(_.map(feats => GUtil.scoreIndexedFeats(feats, weights))); val scores = Array.tabulate(ex.queries.size, ex.allDenotations.size)((i, j) => Float.NegativeInfinity) - for (queryIdx <- 0 until ex.queries.size) { - for (denotationIdx <- 0 until ex.allDenotations.size) { - scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx) - } + for (queryIdx <- 0 until ex.queries.size; denotationIdx <- 0 until ex.allDenotations.size) { + // These are indicator weights, so by 
summing them we can compute the resulting value of choosing a given word + // and a given query by combining the results of the dot product of the query and the denotation + scores(queryIdx)(denotationIdx) = rawQueryScores(queryIdx) + queryDenotationMatrix(queryIdx)(denotationIdx) } scores } @@ -70,7 +74,9 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, */ def getDenotationLogMarginals(ex: JointQueryDenotationExample, weights: Array[Float]): Array[Float] = { val scores = getUnnormalizedJointScores(ex, weights) - // Sum up each column + // the scores matrix contains log(p_{i,j}), so we are using + // logAdd to sum the probabilities + // as p(q,d) \propto e^(w^T f(q,d)) val rawDenotationMarginals = Array.tabulate(ex.allDenotations.size)(i => SloppyMath.logAdd(scores.map(_(i))).toFloat) val normalizer = SloppyMath.logAdd(rawDenotationMarginals).toFloat (0 until rawDenotationMarginals.size).foreach(i => rawDenotationMarginals(i) -= normalizer) @@ -136,6 +142,15 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); computer.computeDenotation(ex, weights) } + + def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { + val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); + val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); + val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); + val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) + + ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).map(_._1) + } } object JointQueryDenotationChooser { @@ -180,6 +195,16 @@ object JointQueryDenotationChooser { exs; } + + def loadDocuments(path : String) = { + val limit = 500 // -1 + if(path.startsWith("wikiser:")) { + WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, 
"", Language.ENGLISH) + } else { + ConllDocReader.loadRawConllDocsWithSuffix(path, limit, "", Language.ENGLISH) + } + } + val trainDataPath = "data/ace05/train"; val testDataPath = "data/ace05/dev"; @@ -197,11 +222,7 @@ object JointQueryDenotationChooser { LightRunner.populateScala(JointQueryDenotationChooser.getClass(), args) // Read in CoNLL documents val assembler = CorefDocAssembler(Language.ENGLISH, true); - val trainDocs = if(trainDataPath.startsWith("wikiser:")) { - WikiDocReader.loadRawWikiDocs(trainDataPath.split(":")(1), -1, "", Language.ENGLISH) - } else { - ConllDocReader.loadRawConllDocsWithSuffix(trainDataPath, -1, "", Language.ENGLISH) - }; + val trainDocs = loadDocuments(trainDataPath); val trainCorefDocs = trainDocs.map(doc => { try { assembler.createCorefDoc(doc, new MentionPropertyComputer(None)) @@ -213,7 +234,11 @@ object JointQueryDenotationChooser { } } }).filter(_!=null); - + + //val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH); + val testDocs = loadDocuments(testDataPath) + val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + // Read in gold Wikification labels val goldWikification = WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiPath) // Read in the title given surface database @@ -237,16 +262,27 @@ object JointQueryDenotationChooser { // Build the test examples and decode the test set // No filtering now because we're doing test - val testDocs = ConllDocReader.loadRawConllDocsWithSuffix(testDataPath, -1, "", Language.ENGLISH); - val testCorefDocs = testDocs.map(doc => assembler.createCorefDoc(doc, new MentionPropertyComputer(None))); + val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); - val goldTestDenotationsAsTrivialChunks = (0 until testExs.size).map(i => new Chunk[Seq[String]](i, i+1, testExs(i).rawCorrectDenotations)) - val predTestDenotationsAsTrivialChunks = (0 until 
testExs.size).map(i => new Chunk[String](i, i+1, chooser.pickDenotation(testExs(i).queries, wikiDB))) + + val results = testExs.map(t => { + // TOD: need more then one perdicted title + (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB)) + }) + + val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) + val predTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[String](i, i+1, results(i)._2(0))) // Hacky but lets us reuse some code that normally evaluates things with variable endpoints // WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks)) WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set()) - + //val outs = new PrintWRiter(System.out) + //WikificationEvaluator.writeWikificationRightAndWrong(outs, outs, ) + + + + + LightRunner.finalizeOutput(); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index cdb1566..9cbf642 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -78,6 +78,33 @@ object WikificationEvaluator { } Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); } + + + // create sets of all the gold document references, and all the documents + // that we generate, and then compute an F1 + + /*def evaluateBOTF1_mfl(results : Seq[(String, Seq[String])]) = { + var correct = 0; + var precDenom = 0; + var recDenom = 0; + for (i <- 0 until results.size) { + + for (title <- allPredTitles(i)) { + var markedCorrect = false; + for (goldTitleSet <- allGoldTitles(i)) { + markedCorrect = markedCorrect || isCorrect(goldTitleSet.toSeq, title); + } + if (markedCorrect) 
{ + correct += 1; + } + } + precDenom += allPredTitles(i).size; + recDenom += allGoldTitles(i).size; + } + Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); + }*/ + + def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = { val bagOfTitles = titles.flatMap(sentTitles => { From b28c1b8498429de1d822f056020fdab82cc3cd6a Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Fri, 13 Mar 2015 12:04:04 -0700 Subject: [PATCH 12/25] buggy f1 scorer --- .../wiki/JointQueryDenotationChooser.scala | 11 +++-- .../entity/wiki/WikificationEvaluator.scala | 42 +++++++++---------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 23c025a..c3c5623 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -149,7 +149,7 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) - ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).map(_._1) + ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) } } @@ -266,8 +266,8 @@ object JointQueryDenotationChooser { val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); val results = testExs.map(t => { - // TOD: need more then one perdicted title - (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB)) + // TODO: need more then one perdicted title + (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB), t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 
until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) @@ -276,11 +276,10 @@ object JointQueryDenotationChooser { // Hacky but lets us reuse some code that normally evaluates things with variable endpoints // WikificationEvaluator.evaluateWikiChunksBySent(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks)) WikificationEvaluator.evaluateFahrniMetrics(Seq(goldTestDenotationsAsTrivialChunks), Seq(predTestDenotationsAsTrivialChunks), Set()) - //val outs = new PrintWRiter(System.out) - //WikificationEvaluator.writeWikificationRightAndWrong(outs, outs, ) - + val mentionsByDoc = results.groupBy(_._3) + WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc) LightRunner.finalizeOutput(); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index 9cbf642..ecd411c 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -1,13 +1,14 @@ package edu.berkeley.nlp.entity.wiki -import edu.berkeley.nlp.entity.Chunk +import edu.berkeley.nlp.entity.{Document, Chunk, GUtil} import edu.berkeley.nlp.futile.util.Logger -import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.util.Counter import scala.collection.JavaConverters._ import edu.berkeley.nlp.entity.joint.JointDocACE import java.io.PrintWriter +import scala.collection.mutable.ArrayBuffer + object WikificationEvaluator { def removeExcludes(chunks: Seq[Chunk[String]]) = chunks.filter(chunk => chunk.label != ExcludeToken) @@ -82,28 +83,27 @@ object WikificationEvaluator { // create sets of all the gold document references, and all the documents // that we generate, and then compute an F1 - - /*def evaluateBOTF1_mfl(results : Seq[(String, Seq[String])]) = { - var correct = 0; - var precDenom = 0; - var recDenom = 0; - for (i <- 0 until results.size) { - - for 
(title <- allPredTitles(i)) { - var markedCorrect = false; - for (goldTitleSet <- allGoldTitles(i)) { - markedCorrect = markedCorrect || isCorrect(goldTitleSet.toSeq, title); - } - if (markedCorrect) { - correct += 1; + def evaluateBOTF1_mfl(results : Map[Document, Seq[(Seq[String], Seq[String], Document)]]) = { + // f1 = 2 * precision * recall / (percison + recall) + var correct = 0 + var precDenom = 0 + var recDenom = 0 + for((doc, matches) <- results) { + var seenBefore = Set[String]() + for((gold, selected, _) <- matches) { + val goldS = Set(gold:_*) + val selectedS = Set(selected(0)) //Set(selected:_*) + val ints = goldS & selectedS + if(!ints.subsetOf(seenBefore)) { + correct += ints.size + seenBefore ++= ints } } - precDenom += allPredTitles(i).size; - recDenom += allGoldTitles(i).size; + precDenom += Set(matches.flatMap(_._2):_*).size + recDenom += Set(matches.flatMap(_._1):_*).size } - Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)); - }*/ - + Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)) + } def convertChunksToBagOfTitles(titles: Iterable[Seq[Chunk[String]]]): Set[String] = { From 8b4bf0128b7c8651b975a8fb55688c84e4113e9b Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 14 Mar 2015 01:00:38 -0700 Subject: [PATCH 13/25] fix f1 metric --- .../entity/wiki/WikificationEvaluator.scala | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala index ecd411c..c4ad61f 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikificationEvaluator.scala @@ -90,17 +90,33 @@ object WikificationEvaluator { var recDenom = 0 for((doc, matches) <- results) { var seenBefore = Set[String]() - for((gold, selected, _) <- matches) { + val 
allGold = Set(matches.flatMap(_._1):_*) + val allChoosen = Set(matches.map(_._2(0)):_*) //Set(matches.flatMap(_._2):_*) + + /*for((gold, selected, _) <- matches) { val goldS = Set(gold:_*) val selectedS = Set(selected(0)) //Set(selected:_*) val ints = goldS & selectedS - if(!ints.subsetOf(seenBefore)) { + //if(!ints.subsetOf(seenBefore)) { correct += ints.size seenBefore ++= ints - } - } - precDenom += Set(matches.flatMap(_._2):_*).size - recDenom += Set(matches.flatMap(_._1):_*).size + //} + }*/ + // TODO: something wrong with computing the set intersection + + val dprecDenom = allChoosen.size + val drecDenom = allGold.size + var dcorrect = 0 + allChoosen.foreach(c => { + if(isCorrect(allGold.toSeq, c)) + dcorrect += 1 + }) + //val diff = (allGold ++ allChoosen) -- (allGold & allChoosen) + //val dcorrect = (allGold & allChoosen).size + //Logger.logss("Document f1: "+GUtil.renderPRF1(dcorrect, dprecDenom, drecDenom)) + precDenom += dprecDenom + recDenom += drecDenom + correct += dcorrect } Logger.logss("Results (BOT F1): " + GUtil.renderPRF1(correct, precDenom, recDenom)) } From bb12bd1d6fe4ed8e400c9da43118489c800c910c Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 17 Mar 2015 16:23:25 -0700 Subject: [PATCH 14/25] some bug fixes --- .../wiki/JointQueryDenotationChooser.scala | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index c3c5623..2a30f00 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -40,7 +40,7 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, val featureIndexer: Indexer[String]) extends LikelihoodAndGradientComputer[JointQueryDenotationExample] { // Used for feature computation val queryChooser 
= new QueryChoiceComputer(wikiDB, featureIndexer) - + def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) @@ -136,21 +136,26 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, class JointQueryDenotationChooser(val featureIndexer: Indexer[String], val weights: Array[Float]) extends Serializable { - def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = { + /*def pickDenotation(queries: Seq[Query], wikiDB: WikipediaInterface): String = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); - val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); computer.computeDenotation(ex, weights) - } + }*/ def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); - val denotations = queries.map(query => wikiDB.disambiguateBestNoDisambig(query)); - val ex = new JointQueryDenotationExample(queries, denotations, Array[String](), Array[String]()); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val dden = Query.extractDenotationSetWithNil(queries, denotations, 10) + val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) } + + def diffFeatures(correct: Query, choosen: Query, wikiDB: WikipediaInterface) = { + + } } object JointQueryDenotationChooser { @@ -197,7 +202,7 @@ object JointQueryDenotationChooser { def 
loadDocuments(path : String) = { - val limit = 500 // -1 + val limit = numLoadedSamples//500 if(path.startsWith("wikiser:")) { WikiDocReader.loadRawWikiDocs(path.split(":")(1), limit, "", Language.ENGLISH) } else { @@ -216,6 +221,8 @@ object JointQueryDenotationChooser { val numItrs = 20 val maxNumWikificationOptions = 7 + + val numLoadedSamples = -1 // for debugging by loading less samples def main(args: Array[String]) { LightRunner.initializeOutput(JointQueryDenotationChooser.getClass()); @@ -263,11 +270,37 @@ object JointQueryDenotationChooser { // Build the test examples and decode the test set // No filtering now because we're doing test - val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = false); + val testExs = extractExamples(testCorefDocs, goldWikification, wikiDB, filterImpossible = true)//false); + + var correctItemWasInSet = 0 val results = testExs.map(t => { // TODO: need more then one perdicted title - (t.rawCorrectDenotations, chooser.pickDenotations(t.queries, wikiDB), t.queries(0).originalMent.rawDoc) + val picks = chooser.pickDenotations(t.queries, wikiDB) + if(!isCorrect(t.rawCorrectDenotations, picks(0))) { + // the pick is not correct, attempt to determine if there would have + // been a better pick that is in the picks list (which basically means all of the + /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) { + // the correct pick was the second answer instead of the first one + // try and report the differences between the two items + println("second pick was correct") + + }*/ + var qq = false + for((p, i) <- picks.drop(1).zipWithIndex) { + // try: t.correctDenotations here? 
+ if(isCorrect(t.correctDenotations, p) || isCorrect(t.rawCorrectDenotations, p)) { + println("Found correct item with "+i) + correctItemWasInSet += 1 + qq = true + //println("found correct item") + } + } + if(!qq) { + println("???") + } + } + (t.rawCorrectDenotations, picks, t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) @@ -280,6 +313,7 @@ object JointQueryDenotationChooser { val mentionsByDoc = results.groupBy(_._3) WikificationEvaluator.evaluateBOTF1_mfl(mentionsByDoc) + println("Number of correct items that were in the set: "+correctItemWasInSet) LightRunner.finalizeOutput(); From 8c77dfbd8f66f1f8f0993a61178e6f4ea620039f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Wed, 25 Mar 2015 17:01:07 -0700 Subject: [PATCH 15/25] make the gold follow the redirect db as they currently reference old pages --- .../java/edu/berkeley/nlp/entity/coref/Mention.scala | 11 +++++++++++ .../coref/PairwiseIndexingFeaturizerJoint.scala | 2 ++ .../nlp/entity/wiki/JointQueryDenotationChooser.scala | 9 +++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala index 8069292..c31144a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/Mention.scala @@ -15,6 +15,7 @@ import edu.berkeley.nlp.entity.WordNetInterfacer // TODO: Extract an interface for ConllDoc so I don't have to keep the whole // document around...but while I'm feature engineering it's useful to be able // to put my hands on anything I want +// ... 
ok settle down class Mention(val rawDoc: Document, val mentIdx: Int, val sentIdx: Int, @@ -39,6 +40,16 @@ class Mention(val rawDoc: Document, var cachedNerPossibilities: Option[Chunk[Counter[String]]] = None; var cachedNerGold: Option[Chunk[String]] = None; + override def toString = { + var ret = "{" + if(startIdx > 1) + ret += rawDoc.words(sentIdx)(startIdx - 1) + " " + ret += "["+spanToString+"]" + if(endIdx < rawDoc.words(sentIdx).size-1) + ret += rawDoc.words(sentIdx)(endIdx+ 1) + ret + "}" + } + def speaker = rawDoc.speakers(sentIdx)(headIdx); def headString = rawDoc.words(sentIdx)(headIdx); diff --git a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala index 31c32f6..21b1ac7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala +++ b/src/main/java/edu/berkeley/nlp/entity/coref/PairwiseIndexingFeaturizerJoint.scala @@ -21,6 +21,8 @@ import edu.berkeley.nlp.entity.WordNetInterfacer * DO NOT try to add WordNetInterfacer here! It is not serializable and so * everything will explode when we try to serialize the model. So we choose * to cache it on the documents even though this is pretty hacky. 
+ * + * TODO: maybe change to using "transient" fields re:^^ */ @SerialVersionUID(1L) class PairwiseIndexingFeaturizerJoint(val featureIndexer: Indexer[String], diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 2a30f00..0717bd7 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -146,7 +146,7 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); - val dden = Query.extractDenotationSetWithNil(queries, denotations, 10) + val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) @@ -177,7 +177,8 @@ object JointQueryDenotationChooser { // There are multiple possible gold Wikipedia titles for some mentions. 
Note that // NIL (no entry in Wikipedia) is included as an explicit choice, so this includes NILs (as // it should according to how the task is defined) - val goldLabel = getGoldWikification(goldWikification(docName), ment) + val goldLabelp = getGoldWikification(goldWikification(docName), ment) + val goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct if (goldLabel.size >= 1) { val queries = Query.extractQueriesBest(ment, true); val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); @@ -189,6 +190,10 @@ object JointQueryDenotationChooser { // if (correctIndices.isEmpty && if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; + println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) + if(goldLabel.contains("Lord_Speaker")) { + println("wtfwtf") + } } else { exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel) } From 1ff173dde61eab366482099b1d529238c44cdaa7 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 26 Mar 2015 23:55:03 -0700 Subject: [PATCH 16/25] attempt at adding more queries to find the matching page title --- .../wiki/JointQueryDenotationChooser.scala | 6 +- .../edu/berkeley/nlp/entity/wiki/Query.scala | 58 +++++++++++++++++-- .../nlp/entity/wiki/QueryChooser.scala | 2 + .../nlp/entity/wiki/WikipediaInterface.scala | 20 +++++-- 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 0717bd7..43648c5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -180,7 +180,11 @@ object JointQueryDenotationChooser { val goldLabelp = getGoldWikification(goldWikification(docName), ment) val 
goldLabel = (goldLabelp ++ goldLabelp.map(wikiDB.redirectsDB.followRedirect(_))).distinct if (goldLabel.size >= 1) { + //val oldqueries = Query.extractQueriesBest_old(ment, true); val queries = Query.extractQueriesBest(ment, true); + /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) { + println("failed") + }*/ val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); // val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_)); val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions); @@ -225,7 +229,7 @@ object JointQueryDenotationChooser { val batchSize = 1 val numItrs = 20 - val maxNumWikificationOptions = 7 + val maxNumWikificationOptions = 20 //7 val numLoadedSamples = -1 // for debugging by loading less samples diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala index ce86957..e7f6c56 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala @@ -16,7 +16,8 @@ case class Query(val words: Seq[String], val originalMent: Mention, val finalSpan: (Int, Int), val queryType: String, - val removePuncFromQuery: Boolean = true) { + val removePuncFromQuery: Boolean = true, + val features: List[String] = List[String]()) { def getFinalQueryStr = { val wordsNoPunc = if (removePuncFromQuery) { @@ -40,9 +41,16 @@ object Query { val PluralQueryExpand = true; val RemovePuncFromQuery = true; val UseFirstHead = true; - val MaxQueryLen = 4; - val BlackList = Set("the", "a", "my", "your", "his", "her", "our", "their", "its", "this", "that", "these", "those") - val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' '); + val MaxQueryLen = 8; + val BlackList = Set( + "the", "a", "my", "your", "his", "her", "our", + "their", "its", "this", "that", "these", "those", + "of" + ) + val 
PuncList = Set( + ',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', + '[', ']', '{', '}', ' ' + ) /** * Check if a token is "blacklisted", meaning that we shouldn't form a query that starts with @@ -73,7 +81,7 @@ object Query { * considering different subsets of the words in the mention and munging capitalization and * stemming, since lowercasing and dropping a plural-marking "s" are useful for nominals. */ - def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { + def extractQueriesBest_old(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { val queries = new ArrayBuffer[Query]; val mentWords = ment.words; // Try the whole query, then prefixes ending in the head @@ -107,6 +115,46 @@ object Query { // } queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()); } + + def extractQueriesBest(ment: Mention, addNilQuery: Boolean = false): Seq[Query] = { + val queries = new ArrayBuffer[Query]() + val mentWords = ment.words + val relHeadIdx = ment.contextTree.getSpanHeadACECustom(ment.startIdx, ment.endIdx) - ment.startIdx + def addQuery(start: Int, end: Int, featsi:List[String]): Unit = { + var feats = featsi // gaaaaa + val thisSlice = new ArrayBuffer[Query]() + val wrds = mentWords.slice(start, end) + thisSlice += new Query(wrds, ment, (start, end), "STD", RemovePuncFromQuery, feats) + val firstWord = wrds(0) + val lastWord = wrds(wrds.size - 1) + if((end - start)== 1) + feats ++= List("SingleItemQuery") + if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", RemovePuncFromQuery, feats); + } + // Stemming (but only on head alone) + if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') { + thisSlice ++= thisSlice.map(qu => + new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", 
RemovePuncFromQuery, feats)); + } + queries ++= thisSlice + } + addQuery(0, ment.endIdx - ment.startIdx, List("SimpleQuery", "FullTextQuery")) + // TODO: make this ignore items that simply add a blacklisted word + for(i <- 0 to relHeadIdx) { + addQuery(i, relHeadIdx + 1, List("SimpleQuery", "PreHeadQuery")) + } + for(i <- relHeadIdx+1 until mentWords.size) { + addQuery(relHeadIdx, i, List("SimpleQuery", "PostHeadQuery")) + } + // try filtering words + val filterWords = mentWords.filter(!isBlacklisted(_, 0)) + if(filterWords.size != mentWords.size) { + // we lost something, make new query + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", RemovePuncFromQuery, List("FilteredQuery")) + } + queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()) + } def extractDenotationSetWithNil(queries: Seq[Query], queryDisambigs: Seq[Counter[String]], maxDenotations: Int): Seq[String] = { val choicesEachQuery = queryDisambigs.map(_.getSortedKeys().asScala); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index e3b4d32..90538cb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -123,6 +123,8 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val longQuery = tagsWithin.size > 3; feat("DescriptorQueryTags=" + queryDescriptor + "-" + contextTag + (if (longQuery) "...") + tagsWithin.slice(Math.max(0, tagsWithin.size - 3), tagsWithin.size).toString); feat("DescriptorHead=" + queryDescriptor + "-" + binSize(querySize) + "-" + ment.headStringLc); + for(f <- query.features) + feat(f) feats.toArray; }); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index ebeb0d3..88bcdd3 100644 --- 
a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -94,19 +94,31 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, } def disambiguateBestGetAllOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def disambiguateBestGetAllOptions(query: Query) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( + Seq(query.getFinalQueryStr)))); } def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllReasonableOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def disambiguateBestGetAllOneBestOptions(ment: Mention, specifiedHeadIdx: Int) = { - auxDB.purgeDisambiguationAll(redirectsDB.followRedirectsCounter(titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions(Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); + auxDB.purgeDisambiguationAll( + redirectsDB.followRedirectsCounter( + titleGivenSurfaceDB.disambiguateQueriesGetAllOneBestOptions( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr)))); } def 
getCategories(title: String) = categoryDB.getCategories(title); From 88d05a29bb56e0b6f811762748614a043cece83f Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 28 Mar 2015 14:27:07 -0700 Subject: [PATCH 17/25] some changes to trying to generate queries --- .../nlp/entity/wiki/JointQueryDenotationChooser.scala | 5 +++-- .../edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala | 6 +++++- .../nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 43648c5..e248cbb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -183,10 +183,11 @@ object JointQueryDenotationChooser { //val oldqueries = Query.extractQueriesBest_old(ment, true); val queries = Query.extractQueriesBest(ment, true); /*if(!(Set(oldqueries.map(_.getFinalQueryStr):_*) subsetOf Set(queries.map(_.getFinalQueryStr):_*))) { - println("failed") + println("failed...") }*/ - val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); + //val queryDisambigs = queries.map(wikiDB.disambiguateBestGetAllOptions(_)); // val denotations = queries.map(wikiDB.disambiguateBestNoDisambig(_)); + val queryDisambigs = queries.map(wikiDB.disambigRes(_)) val denotations = Query.extractDenotationSetWithNil(queries, queryDisambigs, maxNumWikificationOptions); val correctDenotations = denotations.filter(denotation => isCorrect(goldLabel, denotation)) // N.B. 
The use of "isCorrect" here is needed to canonicalize diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 88bcdd3..a6ab5b6 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -106,7 +106,11 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, titleGivenSurfaceDB.disambiguateQueriesGetAllOptions( Seq(query.getFinalQueryStr)))); } - + + def disambigRes(query: Query) = { + titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)) + } + def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { auxDB.purgeDisambiguationAll( redirectsDB.followRedirectsCounter( diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala index 2445259..d41605a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala @@ -12,6 +12,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap // Need to know all titles (including redirects) +// determins what surfaces values link to with a given count @SerialVersionUID(1L) class WikipediaTitleGivenSurfaceDB(val surfaceToTitle: CounterMap[String,String]) extends Serializable { val truecaseMap = new HashMap[String,ArrayBuffer[String]]; From b8a9e17707475af2f15cfb59a3c7d34dddb1f778 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Wed, 1 Apr 2015 10:31:28 -0700 Subject: [PATCH 18/25] lot more queries being generated, but about 1/3 as many impossible queries --- build.sbt | 9 +- .../wiki/JointQueryDenotationChooser.scala | 4 +- .../edu/berkeley/nlp/entity/wiki/Query.scala | 15 ++- 
.../nlp/entity/wiki/WikipediaInterface.scala | 26 +++- .../entity/wiki/WikipediaInterface_db.scala | 127 ++++++++++++++++++ .../wiki/WikipediaTitleGivenSurfaceDB.scala | 8 +- 6 files changed, 176 insertions(+), 13 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala diff --git a/build.sbt b/build.sbt index a3fe7b7..cbfa110 100644 --- a/build.sbt +++ b/build.sbt @@ -4,10 +4,17 @@ name := "berkeley-entity" version := "1" -scalaVersion := "2.11.2" +scalaVersion := "2.11.6" assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } + +libraryDependencies ++= Seq( + "org.scalikejdbc" %% "scalikejdbc" % "2.2.5", + "com.h2database" % "h2" % "1.4.186", + "ch.qos.logback" % "logback-classic" % "1.1.2", + "org.postgresql" % "postgresql" % "9.4-1201-jdbc41" +) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index e248cbb..a2bc630 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -196,9 +196,9 @@ object JointQueryDenotationChooser { if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) - if(goldLabel.contains("Lord_Speaker")) { + /*if(goldLabel.contains("Lord_Speaker")) { println("wtfwtf") - } + }*/ } else { exs += new JointQueryDenotationExample(queries, denotations, correctDenotations, goldLabel) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala index e7f6c56..71a1869 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/wiki/Query.scala @@ -100,6 +100,7 @@ object Query { if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { queriesThisSlice += new Query(Seq(wikiCase(firstWord)) ++ mentWords.slice(indices._1 + 1, indices._2), ment, indices, "WIKICASED", RemovePuncFromQuery); } + // Stemming (but only on head alone) if (PluralQueryExpand && (indices._2 - indices._1) == 1 && firstWord.last == 's') { queriesThisSlice ++= queriesThisSlice.map(query => new Query(Seq(removePlural(query.words(0))), ment, indices, query.queryType + "-STEM", RemovePuncFromQuery)); @@ -124,18 +125,23 @@ object Query { var feats = featsi // gaaaaa val thisSlice = new ArrayBuffer[Query]() val wrds = mentWords.slice(start, end) - thisSlice += new Query(wrds, ment, (start, end), "STD", RemovePuncFromQuery, feats) + thisSlice += new Query(wrds, ment, (start, end), "STD", true, feats ++ List("RemovedPunc")) + thisSlice += new Query(wrds, ment, (start, end), "STD", false, feats ++ List("IncludePunc")) val firstWord = wrds(0) val lastWord = wrds(wrds.size - 1) if((end - start)== 1) feats ++= List("SingleItemQuery") if (!firstWord.map(Character.isUpperCase(_)).reduce(_ || _) && Character.isLowerCase(firstWord(0))) { - thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", RemovePuncFromQuery, feats); + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", true, feats ++ List("RemovedPunc")); + thisSlice += new Query(Seq(wikiCase(firstWord)) ++ wrds.drop(1), ment, (start, end), "WIKICASED", false, feats ++ List("IncludePunc")); } // Stemming (but only on head alone) if (PluralQueryExpand && (end - start) == 1 && firstWord.last == 's') { thisSlice ++= thisSlice.map(qu => - new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", RemovePuncFromQuery, feats)); + new Query(Seq(removePlural(qu.words(0))), ment, (start, 
end), qu.queryType + "-STEM", true, feats ++ List("RemovedPunc"))); + thisSlice ++= thisSlice.map(qu => + new Query(Seq(removePlural(qu.words(0))), ment, (start, end), qu.queryType + "-STEM", false, feats ++ List("IncludePunc"))); + } queries ++= thisSlice } @@ -151,7 +157,8 @@ object Query { val filterWords = mentWords.filter(!isBlacklisted(_, 0)) if(filterWords.size != mentWords.size) { // we lost something, make new query - queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", RemovePuncFromQuery, List("FilteredQuery")) + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", true , List("FilteredQuery", "RemovedPunc")) + queries += new Query(filterWords, ment, (ment.startIdx, ment.endIdx), "FIT", false, List("FilteredQuery", "IncludePunc")) } queries.filter(!_.getFinalQueryStr.isEmpty) ++ (if (addNilQuery) Seq(Query.makeNilQuery(ment)) else Seq[Query]()) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index a6ab5b6..bfe6325 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -74,7 +74,9 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, def disambiguate(ment: Mention) = disambiguateBest(ment, ment.headIdx) def disambiguateBest(ment: Mention, specifiedHeadIdx: Int) = { - redirectsDB.followRedirect(titleGivenSurfaceDB.disambiguateQueries(Query.extractQueriesBest(ment).map(_.getFinalQueryStr))); + redirectsDB.followRedirect( + titleGivenSurfaceDB.disambiguateQueries( + Query.extractQueriesBest(ment).map(_.getFinalQueryStr))); } def disambiguateBestNoDisambig(query: Query) = { @@ -107,10 +109,26 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, Seq(query.getFinalQueryStr)))); } + def merge[T](a: Counter[T], b: Counter[T]) = { + for(k <- 
a.keySet().asScala) { + b.incrementCount(k, a.getCount(k)) + } + } + def disambigRes(query: Query) = { - titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(query.getFinalQueryStr)) + val str = query.getFinalQueryStr + var titles = titleGivenSurfaceDB.disambiguateQueriesGetAllOptions(Seq(str)) + titles.incrementCount(str, 1.0) + var redirs = redirectsDB.followRedirectsCounter(titles) + merge(titles, redirs) + //var aux = auxDB.purgeDisambiguationAll(redirs) + //merge(redirs, aux) + //aux + redirs } + + def disambiguateBestGetAllReasonableOptions(ment: Mention, specifiedHeadIdx: Int) = { auxDB.purgeDisambiguationAll( redirectsDB.followRedirectsCounter( @@ -264,7 +282,9 @@ object WikipediaInterface { // val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. - val queries = corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)).toSet; + val queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) + .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)) + .toSet; Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB]; diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala new file mode 100644 index 0000000..6183546 --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala @@ -0,0 +1,127 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.entity.GUtil +import 
edu.berkeley.nlp.futile.LightRunner +import edu.berkeley.nlp.futile.fig.basic.Indexer +import edu.berkeley.nlp.futile.util.CounterMap +import scalikejdbc._ + +import scala.collection.mutable + +/** + * Created by matthewfl + */ +class WikipediaInterface_db (conn : String) { + + + Class.forName("org.postgresql.Driver") + val settings = ConnectionPoolSettings( + initialSize = 1, + maxSize = 8, + connectionTimeoutMillis = 3000L, + validationQuery = "select 1") + ConnectionPool.add(this, conn, "wiki", "wiki", settings) + + /*val i1 : Option[Int] = using(DB(ConnectionPool.borrow(this))) { db => + db localTx { implicit session => + SQL("select 5 as i").map(r=>r.get[Int](1)).single.apply() + } + } + + println("value of il: "+i1.get) +*/ + + + def disambigRes(query: Query) = { + Seq[String]() + } + + def TitlesGivenSurface = { + var m = new CounterMap[String, String]() + using(DB(ConnectionPool.borrow(this))) { db => { + db localTx { implicit session => { + SQL("select surface_text, page_title, count(*) as cnt from links inner join page on page_latest = to_id group by surface_text, page_title") + .fetchSize(5000) + .foreach(res => { + m.incrementCount(res.string("surface_text"), res.string("page_title"), res.int("cnt")) + }) + }} + }} + new WikipediaTitleGivenSurfaceDB(m) + } + + def Redirects = { + val m = new mutable.HashMap[String,String]() + using(DB(ConnectionPool.borrow(this))) { db => + db localTx { implicit session => { + SQL( + """select pf.page_title as from_page, pt.page_title as to_page + from page pf inner join links on links.from_id = pf.page_latest + inner join page pt on links.to_id = pt.page_latest + where pf.page_is_redirect = 1 limit 10000""" + ).fetchSize(5000) + .foreach(res => { + println("loading redirect "+res.string("from_page")) + m += (res.string("from_page") -> res.string("to_page")) + }) + }} + } + new WikipediaRedirectsDB(m) + } + + def Links = { + // TODO: + val ind = new Indexer[String]() + + null.asInstanceOf[WikipediaLinkDB] + } + + def 
Aux = { + null.asInstanceOf[WikipediaAuxDB] + } + +} + + +object WikipediaInterface_db { + + // database connection string + val conn = "jdbc:postgresql://10.7.0.17/wiki" + + // most stuff should come out of the db + val wikipediaPath = "" + + val categoryDBInputPath = "" + val categoryDBOutputPath = "" + + val outputPath = "" + + def main(args : Array[String]): Unit = { + + LightRunner.initializeOutput(WikipediaInterface_db.getClass); + LightRunner.populateScala(WikipediaInterface_db.getClass, args); + + + var db = new WikipediaInterface_db(conn) + + val catDB = if(!categoryDBInputPath.isEmpty) { + GUtil.load(categoryDBInputPath).asInstanceOf[WikipediaCategoryDB] + } else { + // this is really slow to make the cat database, you should want to avoid this + assert(false) + null.asInstanceOf[WikipediaCategoryDB] + } + + val wi = new WikipediaInterface(db.TitlesGivenSurface, db.Redirects, catDB, db.Links, db.Aux) + + GUtil.save(wi, outputPath) + + if (categoryDBOutputPath != "") { + GUtil.save(catDB, categoryDBOutputPath); + } + LightRunner.finalizeOutput(); + + // going to punt on the links db, as it appears that it is not being used + + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala index d41605a..deaf7d4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTitleGivenSurfaceDB.scala @@ -90,7 +90,9 @@ object WikipediaTitleGivenSurfaceDB { val PuncList = Set(',', '.', '!', '?', ':', ';', '\'', '"', '(', ')', '[', ']', '{', '}', ' '); def isGoodTitle(str: String) = !str.contains("#") && !str.contains(":") && !str.contains("Wikipedia") && !str.startsWith("List of") && !str.startsWith("List_of"); - + + // this is using the set of generated queries to determine which are the best items to extract + // / limit the size of 
the extracted wiki data def processWikipedia(wikipediaPath: String, querySet: Set[String]): WikipediaTitleGivenSurfaceDB = { val lowercase = false; val surfaceToTitle = new CounterMap[String,String]; @@ -130,13 +132,13 @@ object WikipediaTitleGivenSurfaceDB { Logger.logss(querySet.size + " queries, " + counter + " lines processed, " + surfaceToTitle.size + " surface strings found, " + surfaceToTitle.totalCount + " total count"); // .toSeq here to avoid a ConcurrentModificationException - for (key <- surfaceToTitle.keySet.asScala.toSeq) { + /*for (key <- surfaceToTitle.keySet.asScala.toSeq) { surfaceToTitle.getCounter(key).pruneKeysBelowThreshold(1.5); surfaceToTitle.getCounter(key).removeKey(""); if (surfaceToTitle.getCounter(key).isEmpty) { surfaceToTitle.removeKey(key); } - } + }*/ new WikipediaTitleGivenSurfaceDB(surfaceToTitle); } From 4ce0c436fb19d959c96a990d6f16a497489089f7 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Fri, 3 Apr 2015 09:33:56 -0700 Subject: [PATCH 19/25] attempt to include gold data when extracting useful components from wikipedia --- .../nlp/entity/wiki/WikipediaInterface.scala | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index bfe6325..f61df88 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -193,6 +193,8 @@ object WikipediaInterface { val categoryDBInputPath = ""; val categoryDBOutputPath = ""; + + val wikiStandoff = ""; def processWikipedia(wikipediaPath: String, queries: Set[String], parser: CoarseToFineMaxRuleParser, backoffParser: CoarseToFineMaxRuleParser): WikipediaInterface = { val titleGivenSurface = WikipediaTitleGivenSurfaceDB.processWikipedia(wikipediaPath, queries); @@ -282,9 +284,21 @@ object WikipediaInterface { // val queries = 
corefDocs.flatMap(_.predMentions.filter(!_.mentionType.isClosedClass)).flatMap(ment => WikipediaTitleGivenSurfaceDB.extractQueries(ment, ment.headIdx)).toSet; // MFL TODO: this is the queries that will have to be rewritten to support the wiki documents. - val queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) + var queries = corefDocs.flatMap(_.predMentions/*.filter(!_.mentionType.isClosedClass)*/) .flatMap(ment => Query.extractQueriesBest(ment).map(_.getFinalQueryStr)) .toSet; + // some of the gold titles in the older dataset link to current redirect pages + // so we are loading them here so we can normalize the redirects when performing training/testing + val golds : Set[String] = if(!wikiStandoff.isEmpty) { + WikiAnnotReaderWriter.readStandoffAnnotsAsCorpusAnnots(wikiStandoff).flatMap(d => { + d._2.flatMap(v => { + v._2.flatMap(_.label).map(_.replace("_"," ")) + }) + }).toSet + } else { + Set[String]() + } + queries = queries ++ golds Logger.logss("Extracted " + queries.size + " queries from " + corefDocs.size + " documents"); val interface = if (WikipediaInterface.categoryDBInputPath != "") { val categoryDB = GUtil.load(WikipediaInterface.categoryDBInputPath).asInstanceOf[WikipediaCategoryDB]; From 9f3752e632054d6edd7f0f601426fff5c5b6328d Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Thu, 9 Apr 2015 11:21:23 -0700 Subject: [PATCH 20/25] better printing, and fixes to links db --- .../wiki/JointQueryDenotationChooser.scala | 59 ++++++++++++---- .../nlp/entity/wiki/WikipediaInterface.scala | 4 +- .../nlp/entity/wiki/WikipediaLinkDB.scala | 70 ++++++++++++------- 3 files changed, 90 insertions(+), 43 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index a2bc630..6c4bbd4 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ 
b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -143,19 +143,40 @@ class JointQueryDenotationChooser(val featureIndexer: Indexer[String], computer.computeDenotation(ex, weights) }*/ - def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : Seq[String] = { + def pickDenotations(queries: Seq[Query], wikiDB: WikipediaInterface) : (Seq[(String, Int)], Array[Array[Int]]) = { val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) - ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse.map(_._1) + (ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse, + ex.cachedFeatsEachQuery) } - def diffFeatures(correct: Query, choosen: Query, wikiDB: WikipediaInterface) = { + def printEverything(queries: Seq[Query], wikiDB: WikipediaInterface, correctInd: Int) = { + // just redo the computations so gg + val computer = new JointQueryDenotationChoiceComputer(wikiDB, featureIndexer); + val denotations = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val dden = Query.extractDenotationSetWithNil(queries, denotations, JointQueryDenotationChooser.maxNumWikificationOptions) + val ex = new JointQueryDenotationExample(queries, dden, Array[String](), Array[String]()); + val denotationMarginals = computer.getDenotationLogMarginals(ex, weights) + val sortedItms = ex.allDenotations.zipWithIndex.sortBy(v => denotationMarginals(v._2)).reverse + + println( + s"""Correct item in $correctInd (${sortedItms(correctInd)._1}) + |\tGuessed value: ${sortedItms(0)._1}""".stripMargin) + for(i <- 0 until 
queries.length) { + println("\t\t"+i+": "+queries(i)) + println("\t\t"+ex.cachedFeatsEachQuery(i).map(featureIndexer.getObject(_)).mkString(" ")) + for(j <- 0 until ex.allDenotations.length) { + println("\t\t\t"+j+": "+ex.allDenotations(j)+": "+ex.cachedFeatsEachQueryDenotation(i)(j).map(featureIndexer.getObject(_)).mkString(" ")) + } + } + println() } + } object JointQueryDenotationChooser { @@ -195,7 +216,7 @@ object JointQueryDenotationChooser { // if (correctIndices.isEmpty && if (filterImpossible && correctIndices.isEmpty) { numImpossible += 1; - println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) + //println("impossible: "+goldLabel +"\n\tqueries: "+queries+"\n\tdisamb: "+queryDisambigs+"\n\tdentations: "+denotations) /*if(goldLabel.contains("Lord_Speaker")) { println("wtfwtf") }*/ @@ -286,8 +307,8 @@ object JointQueryDenotationChooser { val results = testExs.map(t => { // TODO: need more then one perdicted title - val picks = chooser.pickDenotations(t.queries, wikiDB) - if(!isCorrect(t.rawCorrectDenotations, picks(0))) { + val (picks, denFeats) = chooser.pickDenotations(t.queries, wikiDB) + if(!isCorrect(t.rawCorrectDenotations, picks(0)._1)) { // the pick is not correct, attempt to determine if there would have // been a better pick that is in the picks list (which basically means all of the /*if(picks.size > 1 && isCorrect(t.rawCorrectDenotations, picks(1))) { @@ -296,21 +317,31 @@ object JointQueryDenotationChooser { println("second pick was correct") }*/ - var qq = false - for((p, i) <- picks.drop(1).zipWithIndex) { + var qq = -1 + for((p, i) <- picks.zipWithIndex) { // try: t.correctDenotations here? 
- if(isCorrect(t.correctDenotations, p) || isCorrect(t.rawCorrectDenotations, p)) { - println("Found correct item with "+i) + if(isCorrect(t.correctDenotations, p._1) || isCorrect(t.rawCorrectDenotations, p._1)) { + //println("Found correct item with "+i) correctItemWasInSet += 1 - qq = true + qq = i //println("found correct item") } } - if(!qq) { - println("???") + if(qq != -1) { + chooser.printEverything(t.queries, wikiDB, qq) + /*println( + s"""Correct item in place: $qq + |\tcorrect value: ${picks(qq)} + |\t\t${denFeats(picks(qq)._2).flatMap(featIndexer.getObject(_)).mkString(" ")} + |\tchosen value : ${picks(0)} + |\t\t${denFeats(picks(0)._2).flatMap(featIndexer.getObject(_)).mkString(" ")} + """.stripMargin) +*/ + } else { + println("THIS QUERY SHOULD HAVE BEEN FILTERED") } } - (t.rawCorrectDenotations, picks, t.queries(0).originalMent.rawDoc) + (t.rawCorrectDenotations, picks.map(_._1), t.queries(0).originalMent.rawDoc) }) val goldTestDenotationsAsTrivialChunks = (0 until results.size).map(i => new Chunk[Seq[String]](i, i+1, results(i)._1)) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index f61df88..223cabf 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -203,7 +203,7 @@ object WikipediaInterface { val links = if (WikipediaInterface.computeLinkDB) { WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc); } else { - new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]); + new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser); val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); @@ -219,7 +219,7 @@ object 
WikipediaInterface { val links = if (WikipediaInterface.computeLinkDB) { WikipediaLinkDB.processWikipedia(wikipediaPath, allPageTargetsLc); } else { - new WikipediaLinkDB(new Indexer[String], new HashMap[String,Array[Int]], new HashMap[String,Array[Int]]); + new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index f2f1f6a..e8800c0 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -1,6 +1,7 @@ package edu.berkeley.nlp.entity.wiki import edu.berkeley.nlp.futile.fig.basic.Indexer +import scala.collection.mutable import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import edu.berkeley.nlp.futile.fig.basic.IOUtils @@ -14,33 +15,36 @@ import edu.berkeley.nlp.entity.lang.Language import edu.berkeley.nlp.entity.wiki._ @SerialVersionUID(9084163557546777842L) -class WikipediaLinkDB(val pageNameIndex: Indexer[String], - val inLinksMap: HashMap[String,Array[Int]], - val outLinksMap: HashMap[String,Array[Int]]) extends Serializable { - var outLinksSetCache: HashMap[String,Set[Int]] = null; +class WikipediaLinkDB(private val pageNameIndex: Indexer[String], + private val inLinksMap: HashMap[Int,Array[Int]], + private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable { + @transient + var outLinksSetCache = new mutable.HashMap[String,Set[Int]]() def getOutLinks(title: String) = { - if (outLinksMap.contains(title)) { - outLinksMap(title); + val k = pageNameIndex.indexOf(title) + if (outLinksMap.contains(k)) { + outLinksMap(k); } else { Array[Int](); } } - def 
getOutLinksSetUseCache(title: String) = { - if (outLinksMap.contains(title)) { - if (outLinksSetCache == null) { - outLinksSetCache = new HashMap[String,Set[Int]]; - } - if (!outLinksSetCache.contains(title)) { + def getOutLinksSetUseCache(title: String) : Set[Int] = { + if(outLinksSetCache.contains(title)) { + outLinksSetCache(title) + } else { + val k = pageNameIndex.indexOf(title) + if(k != -1) { if (outLinksSetCache.size > 1000) { outLinksSetCache.dropRight(1); } - outLinksSetCache.put(title, outLinksMap(title).toSet); + val s = outLinksMap(k).toSet + outLinksSetCache.put(title, s) + s + } else { + Set[Int]() } - outLinksSetCache(title); - } else { - Set[Int](); } } @@ -56,9 +60,11 @@ class WikipediaLinkDB(val pageNameIndex: Indexer[String], } def doesOneLinkToOther(title1: String, title2: String): Boolean = { + val ti1 = pageNameIndex.indexOf(title1) + val ti2 = pageNameIndex.indexOf(title2) val outLinksTitle1 = getOutLinks(title1); val outLinksTitle2 = getOutLinks(title2); - outLinksTitle1.contains(pageNameIndex.indexOf(title2)) || outLinksTitle2.contains(pageNameIndex.indexOf(title1)) + outLinksTitle1.contains(ti2) || outLinksTitle2.contains(ti1) } } @@ -66,18 +72,19 @@ object WikipediaLinkDB { def processWikipedia(wikipediaPath: String, pageTitleSetLc: Set[String]): WikipediaLinkDB = { val pageNamesIndex = new Indexer[String]; - val inLinksMap = new HashMap[String,HashSet[Int]]; - val outLinksMap = new HashMap[String,HashSet[Int]]; + val inLinksMap = new HashMap[Int,HashSet[Int]]; + val outLinksMap = new HashMap[Int,HashSet[Int]]; val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath)); var currentPageTitle = ""; - var linksThisPage = new StringBuilder(); + var currentPageTitleind = 0 + //var linksThisPage = new StringBuilder(); var doneWithThisPage = false; var numPagesSeen = 0; var lineIdx = 0; - var isInText = false; - val categoryMap = new HashMap[String,ArrayBuffer[String]]; - val infoboxMap = new HashMap[String,String]; - val appositiveMap 
= new HashMap[String,String]; + //var isInText = false; + //val categoryMap = new HashMap[String,ArrayBuffer[String]]; + //val infoboxMap = new HashMap[String,String]; + //val appositiveMap = new HashMap[String,String]; // Extract first line that's not in brackets while (lines.hasNext) { val line = lines.next; @@ -96,6 +103,7 @@ object WikipediaLinkDB { } else if (line.contains("")) { // 7 = "<title>".length() currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); + currentPageTitleind = pageNamesIndex.getIndex(currentPageTitle) if (!pageTitleSetLc.contains(currentPageTitle.toLowerCase)) { doneWithThisPage = true; } @@ -115,15 +123,23 @@ object WikipediaLinkDB { } if (linkDest != "") { val idx = pageNamesIndex.getIndex(linkDest); - if (!outLinksMap.contains(currentPageTitle)) { - outLinksMap.put(currentPageTitle, new HashSet[Int]); + if (!outLinksMap.contains(currentPageTitleind)) { + outLinksMap.put(currentPageTitleind, new HashSet[Int]); } - outLinksMap(currentPageTitle) += idx; + outLinksMap(currentPageTitleind) += idx; } startIdx = line.indexOf("[[", startIdx + 2); } } } + outLinksMap.foreach(a => { + a._2.foreach(b => { + if(!inLinksMap.contains(b)) { + inLinksMap.put(b, new mutable.HashSet[Int]) + } + inLinksMap(b) += a._1 + }) + }) val inLinksMapArrs = inLinksMap.map(entry => entry._1 -> entry._2.toArray); // TODO: WTF: inlinksmap is never written to val outLinksMapArrs = outLinksMap.map(entry => entry._1 -> entry._2.toArray); val sizes = Array.tabulate(10)(i => 0); From 83ad9542815900e3455d6cad57019a2669bb5d53 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sun, 12 Apr 2015 22:32:27 -0700 Subject: [PATCH 21/25] failed attempt to simply add global wikification features, going to need a classifier that is aware about other choices in the document --- .../wiki/JointQueryDenotationChooser.scala | 2 +- .../nlp/entity/wiki/QueryChooser.scala | 150 ++++++++++++++++++ .../nlp/entity/wiki/WikipediaLinkDB.scala | 52 +++++- 
.../entity/wiki/WikipediaRedirectsDB.scala | 1 + 4 files changed, 199 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 6c4bbd4..9b206d5 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -44,7 +44,7 @@ class JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) - ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations(ex.queries, ex.allDenotations, addToIndexer) + ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB) } } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index 90538cb..de02806 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -18,6 +18,8 @@ import edu.berkeley.nlp.entity.ConllDocReader import edu.berkeley.nlp.entity.coref.CorefDocAssembler import edu.berkeley.nlp.entity.coref.MentionPropertyComputer +import scala.collection.mutable + case class QueryChoiceExample(val queries: Seq[Query], val denotations: Seq[String], val correctQueryIndices: Array[Int]) { @@ -128,6 +130,154 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, feats.toArray; }); } + + def getDentationLinksSets(denotations: Seq[String], wikiDB: WikipediaInterface) : (Seq[Set[Int]], Seq[Set[Int]]) = { + (denotations.map(wikiDB.linksDB.getInLinksSetUseCache(_)), 
denotations.map(wikiDB.linksDB.getOutLinksSetUseCache(_))) + } + + val logsv = (0 until 3000).map(Math.log(_)) + + def logs(i: Int) = { + if(i < logsv.size) + logsv(i) + else + Math.log(i) + } + + def unionSize[T](ss: Set[T]*) = { + val ns = new mutable.HashSet[T]() + for(s <- ss) { + ns ++= s + } + ns.size + } + + def intersectSize[T](a: Set[T], b: Set[T]) = { + var smaller: Set[T] = a + var larger: Set[T] = b + if(a.size > b.size) { + larger = a + smaller = b + } + var ret = 0 + for(i <- smaller) { + if(larger.contains(i)) + ret += 1 + } + ret + } + + def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) / + (logs(wsize) - logs(math.min(a.size,b.size))) + } + + def PMI[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + // TODO: ? the use of wsize here does not make since + // must be misunderstanding something + (intersectSize(a,b) * wsize).asInstanceOf[Float] / (a.size * b.size) + } + + def GLOWfeatures[T](fn: (Set[T], Set[T], Int) => Double, refs: Seq[Set[T]], prefix: String): Seq[Array[String]] = { + val rsize = refs.size + val wsize = unionSize(refs:_*) + var max = Double.NegativeInfinity + var avg = 0.0 + // TODO: rank the items in the list + //val valList = new mutable.MutableList[Double]() + val cache = new mutable.HashMap[Int,Double] { + override def initialSize: Int = rsize*rsize + } + for(a <- 0 until rsize; b <- 0 until rsize) { + if(a != b) { + val v = fn(refs(a), refs(b), wsize) + cache.put(a + b*65536, v) + if(v > max) + max = v + //valList += v + avg += v + } + } + avg /= (rsize * (rsize - 1)) + for(a <- 0 until rsize) yield { + var isInMax = false + var isAboveAvg = false + var isAboveAvg2 = false + for(b <- 0 until rsize) { + if(a != b) { + //val v = fn(refs(a),refs(b),wsize) + val v : Double = cache.getOrElse(a + b*65536, 0.0) + if(v == max) { + isInMax = true + } + if(v > avg) { + isAboveAvg = true + } + if(v > (avg * 2)) { + isAboveAvg2 = true + } + } + } + val r = new 
ArrayBuffer[String] + if(isInMax) + r += prefix + "IsInMax" + if(isAboveAvg) + r += prefix + "isAboveAvg" + if(isAboveAvg2) + r += prefix + "isAboveAvg2" + r.toArray + } + } + + def featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = { + val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); + val queryNonemptyList = queryOutcomes.map(_.isEmpty); + val ment = queries.head.originalMent; + val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; + val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) + + val PMINGDvals = Seq( + GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), + GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), + GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), + GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-") + ) + // TODO: this is not correct,..... + + + Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => { + val feats = new ArrayBuffer[Int]; + def feat(str: String) = addFeat(str, feats, addToIndexer); + for(p <- PMINGDvals) + for(f <- p(denIdx)) + feat(f) + val query = queries(queryIdx); + val den = denotations(denIdx); + if (den == NilToken) { + feat("NilAndQueryNonempty=" + queryNonemptyList(queryIdx)); + } else if (queryOutcomes(queryIdx).containsKey(den)) { + val queryDescriptorWithProper = (if (ment.pos(ment.headIdx - ment.startIdx) == "NNP") "PROP" else "NOM") + "-" + query.queryType; + val queryRank = queryOutcomes(queryIdx).getSortedKeys().indexOf(den); + feat("Rank=" + queryDescriptorWithProper + "-" + (queryRank + 1)) + val queryStr = query.getFinalQueryStr; + val matchesQuery = den.toLowerCase == queryStr.toLowerCase; + feat("MatchesQuery=" + queryDescriptorWithProper + "-" + matchesQuery) + if (!matchesQuery) { + feat("ContainsQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.contains(queryStr.toLowerCase))); + feat("StartsWithQuery=" + 
queryDescriptorWithProper + "-" + (den.toLowerCase.startsWith(queryStr.toLowerCase))); + feat("EndsWithQuery=" + queryDescriptorWithProper + "-" + (den.toLowerCase.endsWith(queryStr.toLowerCase))); + } + val denotationHasParenthetical = den.contains("(") && den.endsWith(")"); + feat("ContainsParenthetical=" + queryDescriptorWithProper + "-" + denotationHasParenthetical); + if (denotationHasParenthetical) { + feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase)) + } + } else { + feat("Impossible"); + } + feats.toArray; + }); + } def featurizeQueriesAndDenotations(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean): Array[Array[Array[Int]]] = { val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala index e8800c0..d2b00cb 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaLinkDB.scala @@ -19,8 +19,11 @@ class WikipediaLinkDB(private val pageNameIndex: Indexer[String], private val inLinksMap: HashMap[Int,Array[Int]], private val outLinksMap: HashMap[Int,Array[Int]]) extends Serializable { @transient - var outLinksSetCache = new mutable.HashMap[String,Set[Int]]() - + private var outLinksSetCache : mutable.HashMap[String,Set[Int]] = null + + @transient + private var inLinksSetCache : mutable.HashMap[String,Set[Int]] = null + def getOutLinks(title: String) = { val k = pageNameIndex.indexOf(title) if (outLinksMap.contains(k)) { @@ -29,17 +32,51 @@ class WikipediaLinkDB(private val pageNameIndex: Indexer[String], Array[Int](); } } - + + def getInLinks(title: String) = { + val k = pageNameIndex.indexOf(title) + if(inLinksMap.contains(k)) { + inLinksMap(k) + } else { + Array[Int]() + } + } + + def 
getInLinksSetUseCache(title: String) : Set[Int] = { + if(inLinksSetCache == null) { + inLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } + if(inLinksSetCache.contains(title)) { + inLinksSetCache(title) + } else { + val k = pageNameIndex.indexOf(title) + if(k != -1) { + if (inLinksSetCache.size > 1000) { + inLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } + val s = inLinksMap.getOrElse(k, Array[Int]()).toSet + inLinksSetCache.put(title, s) + s + } else { + Set[Int]() + } + } + } + def getOutLinksSetUseCache(title: String) : Set[Int] = { + if(outLinksSetCache == null) { + outLinksSetCache = new mutable.HashMap[String,Set[Int]]() + } if(outLinksSetCache.contains(title)) { outLinksSetCache(title) } else { val k = pageNameIndex.indexOf(title) if(k != -1) { if (outLinksSetCache.size > 1000) { - outLinksSetCache.dropRight(1); + // dropping one item was taking too long + outLinksSetCache = new mutable.HashMap[String,Set[Int]]() } - val s = outLinksMap(k).toSet + val s = outLinksMap.getOrElse(k, Array[Int]()).toSet outLinksSetCache.put(title, s) s } else { @@ -108,6 +145,11 @@ object WikipediaLinkDB { doneWithThisPage = true; } } else if (line.contains("")) + val idx = pageNamesIndex.getIndex(linkDest) + val hs = new HashSet[Int] + hs.add(idx) + outLinksMap.put(currentPageTitleind, hs) doneWithThisPage = true; } var startIdx = line.indexOf("[["); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala index 654f8fa..dbc4e88 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaRedirectsDB.scala @@ -48,6 +48,7 @@ object WikipediaRedirectsDB { val CapitalizeInitial = true; def removeWeirdMarkup(str: String) = { + // TODO: this is a slow method, don't use str.replace("'", "'"); } From 5b5a5d7ba50ec6037e7350256049c00d53f81106 Mon Sep 17 00:00:00 2001 From: Matthew 
Francis-Landau Date: Sun, 12 Apr 2015 22:35:06 -0700 Subject: [PATCH 22/25] remove unused sql attempt --- build.sbt | 7 - .../entity/wiki/WikipediaInterface_db.scala | 127 ------------------ 2 files changed, 134 deletions(-) delete mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala diff --git a/build.sbt b/build.sbt index cbfa110..77738fd 100644 --- a/build.sbt +++ b/build.sbt @@ -11,10 +11,3 @@ assemblySettings mainClass in assembly := Some("edu.berkeley.nlp.entity.Driver") unmanagedResourceDirectories in Compile += { baseDirectory.value / "resources/" } - -libraryDependencies ++= Seq( - "org.scalikejdbc" %% "scalikejdbc" % "2.2.5", - "com.h2database" % "h2" % "1.4.186", - "ch.qos.logback" % "logback-classic" % "1.1.2", - "org.postgresql" % "postgresql" % "9.4-1201-jdbc41" -) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala deleted file mode 100644 index 6183546..0000000 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface_db.scala +++ /dev/null @@ -1,127 +0,0 @@ -package edu.berkeley.nlp.entity.wiki - -import edu.berkeley.nlp.entity.GUtil -import edu.berkeley.nlp.futile.LightRunner -import edu.berkeley.nlp.futile.fig.basic.Indexer -import edu.berkeley.nlp.futile.util.CounterMap -import scalikejdbc._ - -import scala.collection.mutable - -/** - * Created by matthewfl - */ -class WikipediaInterface_db (conn : String) { - - - Class.forName("org.postgresql.Driver") - val settings = ConnectionPoolSettings( - initialSize = 1, - maxSize = 8, - connectionTimeoutMillis = 3000L, - validationQuery = "select 1") - ConnectionPool.add(this, conn, "wiki", "wiki", settings) - - /*val i1 : Option[Int] = using(DB(ConnectionPool.borrow(this))) { db => - db localTx { implicit session => - SQL("select 5 as i").map(r=>r.get[Int](1)).single.apply() - } - } - - println("value of il: "+i1.get) -*/ - - - def disambigRes(query: Query) = { 
- Seq[String]() - } - - def TitlesGivenSurface = { - var m = new CounterMap[String, String]() - using(DB(ConnectionPool.borrow(this))) { db => { - db localTx { implicit session => { - SQL("select surface_text, page_title, count(*) as cnt from links inner join page on page_latest = to_id group by surface_text, page_title") - .fetchSize(5000) - .foreach(res => { - m.incrementCount(res.string("surface_text"), res.string("page_title"), res.int("cnt")) - }) - }} - }} - new WikipediaTitleGivenSurfaceDB(m) - } - - def Redirects = { - val m = new mutable.HashMap[String,String]() - using(DB(ConnectionPool.borrow(this))) { db => - db localTx { implicit session => { - SQL( - """select pf.page_title as from_page, pt.page_title as to_page - from page pf inner join links on links.from_id = pf.page_latest - inner join page pt on links.to_id = pt.page_latest - where pf.page_is_redirect = 1 limit 10000""" - ).fetchSize(5000) - .foreach(res => { - println("loading redirect "+res.string("from_page")) - m += (res.string("from_page") -> res.string("to_page")) - }) - }} - } - new WikipediaRedirectsDB(m) - } - - def Links = { - // TODO: - val ind = new Indexer[String]() - - null.asInstanceOf[WikipediaLinkDB] - } - - def Aux = { - null.asInstanceOf[WikipediaAuxDB] - } - -} - - -object WikipediaInterface_db { - - // database connection string - val conn = "jdbc:postgresql://10.7.0.17/wiki" - - // most stuff should come out of the db - val wikipediaPath = "" - - val categoryDBInputPath = "" - val categoryDBOutputPath = "" - - val outputPath = "" - - def main(args : Array[String]): Unit = { - - LightRunner.initializeOutput(WikipediaInterface_db.getClass); - LightRunner.populateScala(WikipediaInterface_db.getClass, args); - - - var db = new WikipediaInterface_db(conn) - - val catDB = if(!categoryDBInputPath.isEmpty) { - GUtil.load(categoryDBInputPath).asInstanceOf[WikipediaCategoryDB] - } else { - // this is really slow to make the cat database, you should want to avoid this - assert(false) - 
null.asInstanceOf[WikipediaCategoryDB] - } - - val wi = new WikipediaInterface(db.TitlesGivenSurface, db.Redirects, catDB, db.Links, db.Aux) - - GUtil.save(wi, outputPath) - - if (categoryDBOutputPath != "") { - GUtil.save(catDB, categoryDBOutputPath); - } - LightRunner.finalizeOutput(); - - // going to punt on the links db, as it appears that it is not being used - - } -} \ No newline at end of file From b577da33a8ff64fe2b92f2f641ad4b681b092ddc Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 14 Apr 2015 00:00:47 -0700 Subject: [PATCH 23/25] text db for getting some bow features from documents --- .../entity/wiki/DocumentedSetChooser.scala | 42 +++++++++ .../wiki/JointQueryDenotationChooser.scala | 4 +- .../nlp/entity/wiki/QueryChooser.scala | 25 ++++- .../nlp/entity/wiki/WikipediaInterface.scala | 9 +- .../nlp/entity/wiki/WikipediaTextDB.scala | 91 +++++++++++++++++++ 5 files changed, 162 insertions(+), 9 deletions(-) create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala create mode 100644 src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala new file mode 100644 index 0000000..acd575e --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/DocumentedSetChooser.scala @@ -0,0 +1,42 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.futile.LightRunner + +/** + * Created by matthewfl + * + * We want to work with the who document at a time rather then just a single link + * this will allow us to + */ +class DocumentedSetChooser { + +} + + +object DocumentedSetChooser { + + val trainDataPath = "data/ace05/train"; + val testDataPath = "data/ace05/dev"; + val wikiPath = "data/ace05/ace05-all-conll-wiki" // contains the wiki links for both items + val wikiDBPath = "models/wiki-db-ace.ser.gz" + + val lambda = 1e-8F + val batchSize = 1 
+ val numItrs = 20 + + val maxNumWikificationOptions = 20 //7 + + val numLoadedSamples = -1 // for debugging by loading less samples + + + def main(args: Array[String]) = { + LightRunner.initializeOutput(DocumentedSetChooser.getClass) + LightRunner.populateScala(DocumentedSetChooser.getClass, args) + + // load the documents + + + + LightRunner.finalizeOutput() + } +} \ No newline at end of file diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index 9b206d5..d887378 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -284,7 +284,9 @@ object JointQueryDenotationChooser { // Make training examples, filtering out those with solutions that are unreachable because // they're not good for training val trainExs = extractExamples(trainCorefDocs, goldWikification, wikiDB, filterImpossible = true) - + + // going to have make this system work on a set of a document + // Extract features val featIndexer = new Indexer[String] val computer = new JointQueryDenotationChoiceComputer(wikiDB, featIndexer); diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index de02806..f800213 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -167,7 +167,7 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, ret } - def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { + /*def NGD[T](a: Set[T], b: Set[T], wsize: Int) : Double = { (logs(math.max(a.size, b.size)) - logs(intersectSize(a,b))) / (logs(wsize) - logs(math.min(a.size,b.size))) } @@ -227,7 +227,7 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, r += prefix + "isAboveAvg2" r.toArray } - } + }*/ def 
featurizeQueriesAndDenotations_GLOW(queries: Seq[Query], denotations: Seq[String], addToIndexer: Boolean, wikiDB: WikipediaInterface): Array[Array[Array[Int]]] = { val queryOutcomes = queries.map(query => wikiDB.disambiguateBestGetAllOptions(query)); @@ -236,21 +236,36 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) - val PMINGDvals = Seq( + /*val PMINGDvals = Seq( GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), GLOWfeatures[Int](NGD, refLinksOut, "NGD-out-") - ) + )*/ + // TODO: this is not correct... + // we need to know what we are going to annotate the document with; + // these are going to be denotations for a single example, which won't be useful, + // so we need to get all the possible annotations for a given document. + // + // In the wikification paper they have something that is choosing the references together; + // we need to look at pairs of references. + + + + + // TODO: implement the local vector features which compare the text of the pages + // the context can be the set of items linking into/out of a page?
but then that isn't the similarity + Array.tabulate(queries.size, denotations.size)((queryIdx, denIdx) => { val feats = new ArrayBuffer[Int]; def feat(str: String) = addFeat(str, feats, addToIndexer); - for(p <- PMINGDvals) + /*for(p <- PMINGDvals) for(f <- p(denIdx)) feat(f) + */ val query = queries(queryIdx); val den = denotations(denIdx); if (den == NilToken) { diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala index 223cabf..d87fbfe 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaInterface.scala @@ -56,7 +56,8 @@ class WikipediaInterface(val titleGivenSurfaceDB: WikipediaTitleGivenSurfaceDB, val redirectsDB: WikipediaRedirectsDB, val categoryDB: WikipediaCategoryDB, val linksDB: WikipediaLinkDB, - val auxDB: WikipediaAuxDB) extends Serializable { + val auxDB: WikipediaAuxDB, + val textDB: WikipediaTextDB) extends Serializable { def getStandardPriorForJointModel(ment: Mention) = { val counter = new Counter[String]; @@ -207,7 +208,8 @@ object WikipediaInterface { } val categories = WikipediaCategoryDB.processWikipedia(wikipediaPath, allPageTargetsLc, parser, backoffParser); val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); - val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux); + val texts = WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc); + val wi = new WikipediaInterface(titleGivenSurface, redirects, categories, links, aux, texts); wi.printSome(); wi; } @@ -222,7 +224,8 @@ object WikipediaInterface { new WikipediaLinkDB(new Indexer[String], new HashMap[Int,Array[Int]], new HashMap[Int,Array[Int]]); } val aux = WikipediaAuxDB.processWikipedia(wikipediaPath, allPageTargetsLc); - val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux); + val texts = 
WikipediaTextDB.processWikipedia(wikipediaPath, allPageTargetsLc); + val wi = new WikipediaInterface(titleGivenSurface, redirects, categoryDB, links, aux, texts); wi.printSome(); wi; } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala new file mode 100644 index 0000000..66cb11f --- /dev/null +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -0,0 +1,91 @@ +package edu.berkeley.nlp.entity.wiki + +import edu.berkeley.nlp.futile.fig.basic.{IOUtils, Indexer} +import edu.berkeley.nlp.futile.util.Counter + +import scala.collection.JavaConversions._ + + +import scala.StringBuilder +import scala.collection.mutable + +/** + * Created by matthewfl + * + * Provide proxy bow counts for documents so we can compute the similarity between two documents + */ +@SerialVersionUID(1L) +class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Counter[Int]]) { + + +} + +object WikipediaTextDB { + def processWikipedia(wikipediaPath:String, querySet: Set[String]) : WikipediaTextDB = { + val lines = IOUtils.lineIterator(IOUtils.openInHard(wikipediaPath)); + var currentPageTitle: String = null + val indexer = new Indexer[String] + val totalWordCounts = new Counter[Int] + var currentWordCounts = new Counter[Int] + val documentResults = new mutable.HashMap[String,Counter[Int]] + var lineIdx = 0 + var numPagesSeen = 0 + var doneWithThisPage = false + + while(lines.hasNext) { + val line = lines.next + if (lineIdx % 100000 == 0) { + println("Line: " + lineIdx + ", processed " + numPagesSeen + " pages"); + } + lineIdx += 1; + if (line.size > 8 && doneWithThisPage) { + // Do nothing + } else { + if(line.contains("")) { + doneWithThisPage = false + numPagesSeen += 1 + } else if (line.contains("")) { + // 7 = "<title>".length() + currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); + if 
(!querySet.contains(currentPageTitle.toLowerCase)) { + doneWithThisPage = true; + } else { + currentWordCounts = new Counter[Int]() + documentResults += (currentPageTitle -> currentWordCounts) + } + } else if(line.contains("") + 1 + var document = new StringBuilder() + var textEnd = line.indexOf("") + if(textEnd != -1) { + document.append(line.substring(textStart, textEnd)) + } else { + var curLine = line.substring(textStart) + while(textEnd == -1) { + document.append(curLine) + curLine = lines.next + textEnd = curLine.indexOf("") + } + document.append(curLine.substring(0, textEnd)) + } + // TODO: maybe toSet + document.toString.split("[^A-Za-z]").foreach(w => { + val i = indexer.getIndex(w) + totalWordCounts.incrementCount(i, 1.0) + currentWordCounts.incrementCount(i, 1.0) + }) + } + } + } + + // get the 300 most common words and remove them from all the documents + val wrdsq = totalWordCounts.asPriorityQueue + val removeWords = new mutable.HashSet[Int]() + for(i <- 0 until 300; if wrdsq.hasNext) + removeWords += wrdsq.next + documentResults.foreach(_._2.prune(removeWords)) + + + new WikipediaTextDB(indexer, documentResults) + } +} From 92b5173c537d044a9657ccd188c485333c8b5692 Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Tue, 14 Apr 2015 11:53:09 -0700 Subject: [PATCH 24/25] some bug fixes and reduce memory pressure --- .../nlp/entity/wiki/WikipediaTextDB.scala | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala index 66cb11f..5aef776 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -12,10 +12,10 @@ import scala.collection.mutable /** * Created by matthewfl * - * Provide proxy bow counts for documents so we can compute the similarity between two documents + * Provide bow counts for 
documents so we can compute the similarity between two documents */ @SerialVersionUID(1L) -class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Counter[Int]]) { +class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Array[Int]]) extends Serializable { } @@ -26,8 +26,8 @@ object WikipediaTextDB { var currentPageTitle: String = null val indexer = new Indexer[String] val totalWordCounts = new Counter[Int] - var currentWordCounts = new Counter[Int] - val documentResults = new mutable.HashMap[String,Counter[Int]] + var currentWordCounts = new mutable.HashSet[Int] + val documentResults = new mutable.HashMap[String,Array[Int]] var lineIdx = 0 var numPagesSeen = 0 var doneWithThisPage = false @@ -46,16 +46,19 @@ object WikipediaTextDB { numPagesSeen += 1 } else if (line.contains("")) { // 7 = "<title>".length() - currentPageTitle = line.substring(line.indexOf("<title>") + 7, line.indexOf("")); - if (!querySet.contains(currentPageTitle.toLowerCase)) { + val newPageTitle = line.substring(line.indexOf("") + 7, line.indexOf("")); + if (!querySet.contains(newPageTitle.toLowerCase)) { doneWithThisPage = true; } else { - currentWordCounts = new Counter[Int]() - documentResults += (currentPageTitle -> currentWordCounts) + if(currentPageTitle != null) { + documentResults += (currentPageTitle -> currentWordCounts.toArray) + } + currentWordCounts = new mutable.HashSet[Int]() + currentPageTitle = newPageTitle } } else if(line.contains("") + 1 - var document = new StringBuilder() + val document = new StringBuilder() var textEnd = line.indexOf("") if(textEnd != -1) { document.append(line.substring(textStart, textEnd)) @@ -72,7 +75,8 @@ object WikipediaTextDB { document.toString.split("[^A-Za-z]").foreach(w => { val i = indexer.getIndex(w) totalWordCounts.incrementCount(i, 1.0) - currentWordCounts.incrementCount(i, 1.0) + currentWordCounts += i + //currentWordCounts.incrementCount(i, 1.0) }) } } @@ -83,7 +87,9 @@ object 
WikipediaTextDB { val removeWords = new mutable.HashSet[Int]() for(i <- 0 until 300; if wrdsq.hasNext) removeWords += wrdsq.next - documentResults.foreach(_._2.prune(removeWords)) + for(k <- documentResults) { + documentResults(k._1) = k._2.filter(!removeWords.contains(_)).sorted + } new WikipediaTextDB(indexer, documentResults) From 1a5e9454d21e285084c5e338b7153cb3e8b86e9d Mon Sep 17 00:00:00 2001 From: Matthew Francis-Landau Date: Sat, 18 Apr 2015 17:20:14 -0700 Subject: [PATCH 25/25] fixes for document word vectors --- .../edu/berkeley/nlp/entity/Document.scala | 2 ++ .../wiki/JointQueryDenotationChooser.scala | 4 +++ .../nlp/entity/wiki/QueryChooser.scala | 13 ++++--- .../nlp/entity/wiki/WikipediaTextDB.scala | 34 ++++++++++++++++++- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/berkeley/nlp/entity/Document.scala b/src/main/java/edu/berkeley/nlp/entity/Document.scala index 44555df..cf95766 100644 --- a/src/main/java/edu/berkeley/nlp/entity/Document.scala +++ b/src/main/java/edu/berkeley/nlp/entity/Document.scala @@ -31,4 +31,6 @@ trait Document { def isConversation : Boolean = false def getCorrespondingNERChunk (sentIdx : Int, headIdx : Int) : Option[Chunk[String]] + + var documentVectorCache: Array[Int] = null } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala index d887378..b145732 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/JointQueryDenotationChooser.scala @@ -29,6 +29,8 @@ case class JointQueryDenotationExample(val queries: Seq[Query], // Feature caches since feature computation is expensive if redone every time var cachedFeatsEachQuery: Array[Array[Int]] = null; var cachedFeatsEachQueryDenotation: Array[Array[Array[Int]]] = null; + + def document = queries.head.originalMent.rawDoc } /** @@ -43,6 +45,8 @@ class 
JointQueryDenotationChoiceComputer(val wikiDB: WikipediaInterface, def featurizeUseCache(ex: JointQueryDenotationExample, addToIndexer: Boolean) { if (ex.cachedFeatsEachQuery == null) { + if(ex.document.documentVectorCache == null) + ex.document.documentVectorCache = wikiDB.textDB.makeVector(ex.document.words) ex.cachedFeatsEachQuery = queryChooser.featurizeQueries(ex.queries, addToIndexer) ex.cachedFeatsEachQueryDenotation = queryChooser.featurizeQueriesAndDenotations_GLOW(ex.queries, ex.allDenotations, addToIndexer, wikiDB) } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala index f800213..712ad3d 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/QueryChooser.scala @@ -234,9 +234,9 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, val queryNonemptyList = queryOutcomes.map(_.isEmpty); val ment = queries.head.originalMent; val mentUpToHeadSize = ment.headIdx - ment.startIdx + 1; - val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) + /*val (refLinksIn, refLinksOut) = getDentationLinksSets(denotations, wikiDB) - /*val PMINGDvals = Seq( + val PMINGDvals = Seq( GLOWfeatures[Int](PMI, refLinksIn, "PMI-in-"), GLOWfeatures[Int](NGD, refLinksIn, "NGD-in-"), GLOWfeatures[Int](PMI, refLinksOut, "PMI-out-"), @@ -251,8 +251,9 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, // in the wikification paper they have something that is choosing the references together // need to look at pairs of references and - - + val denotationSim = denotations.map(t => wikiDB.textDB.compareDocumentC(ment.rawDoc.documentVectorCache, t)) + val denotationSimMax = denotationSim.max + val denotationSimAvg = denotationSim.sum / denotationSim.size // TODO: implement the local vector features which compare the text of the pages // the context can be the set of items linking into/outof a page? 
but then that isn't the similarity @@ -287,6 +288,10 @@ class QueryChoiceComputer(val wikiDB: WikipediaInterface, if (denotationHasParenthetical) { feat("MatchesQueryUpToParen=" + queryDescriptorWithProper + "-" + (den.substring(0, den.indexOf("(")).trim.toLowerCase == queryStr.toLowerCase)) } + feat("CompariableWordsLog="+Math.floor(Math.log(denotationSim(denIdx)))) + feat("CompariableIsMaxWordSim=" + (denotationSim(denIdx) == denotationSimMax)) + feat("CompariableWordsAboveAvg=" + (denotationSim(denIdx) > denotationSimAvg)) + feat("CompariableWordsReweight="+Math.floor(denotationSim(denIdx) / denotationSimMax * 10)) } else { feat("Impossible"); } diff --git a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala index 5aef776..aca157a 100644 --- a/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala +++ b/src/main/java/edu/berkeley/nlp/entity/wiki/WikipediaTextDB.scala @@ -17,6 +17,38 @@ import scala.collection.mutable @SerialVersionUID(1L) class WikipediaTextDB (val indexer: Indexer[String], val words: mutable.HashMap[String, Array[Int]]) extends Serializable { + def getDocument(title: String) = words.getOrElse(title, Array[Int]()) + + def compareVectors(a: Array[Int], b: Array[Int]) = { + var ai = 0 + var bi = 0 + var simcnt = 0 + while(ai < a.size && bi < b.size) { + if(a(ai) == b(bi)) { + simcnt += 1 + ai += 1 + bi += 1 + } else if(a(ai) > b(bi)) { + bi += 1 + } else { + ai += 1 + } + } + simcnt + } + + def compareTitles(atitle: String, btitle: String) = compareVectors(getDocument(atitle), getDocument(btitle)) + + def makeVector(document: Seq[Seq[String]]) = { + document.flatMap(_.map(v => indexer.indexOf(v.toLowerCase))).toSet.filter(_ != -1).toArray.sorted + } + + def compareDocument(doc: Array[Int], title: String) = compareVectors(doc, getDocument(title)) + + def compareDocumentC(doc: Array[Int], title: String) = { + val tdoc = getDocument(title) + compareVectors(doc, 
tdoc).asInstanceOf[Double] / (doc.size * tdoc.size) + } } @@ -73,7 +105,7 @@ object WikipediaTextDB { } // TODO: maybe toSet document.toString.split("[^A-Za-z]").foreach(w => { - val i = indexer.getIndex(w) + val i = indexer.getIndex(w.toLowerCase) totalWordCounts.incrementCount(i, 1.0) currentWordCounts += i //currentWordCounts.incrementCount(i, 1.0)