From 96d2f948f2af5a0e11859fc1101ced9de36d862b Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 24 Aug 2020 14:35:56 +0200 Subject: [PATCH] Use collective's addressbook to configure regexner --- .../analysis/nlp/TextAnalyserSuite.scala | 9 ++- .../src/main/scala/docspell/common/File.scala | 4 ++ .../docspell/joex/process/TextAnalysis.scala | 65 +++++++++++++++++-- .../docspell/store/queries/QCollective.scala | 15 +++++ 4 files changed, 84 insertions(+), 9 deletions(-) diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index b7c083a1..b22093f1 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -3,12 +3,17 @@ package docspell.analysis.nlp import minitest.SimpleTestSuite import docspell.files.TestFiles import docspell.common._ +import edu.stanford.nlp.pipeline.StanfordCoreNLP object TextAnalyserSuite extends SimpleTestSuite { + lazy val germanClassifier = + new StanfordCoreNLP(Properties.nerGerman(None, false)) + lazy val englishClassifier = + new StanfordCoreNLP(Properties.nerEnglish(None)) test("find english ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText) + StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText) val expect = Vector( NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), @@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite { test("find german ner labels") { val labels = - StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText) + StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText) val expect = Vector( NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Mustermann", NerTag.Person, 4, 14), diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 0efc552a..2d5cfb8a 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -1,6 +1,7 @@ package docspell.common import java.io.IOException +import java.nio.charset.StandardCharsets import java.nio.file._ import java.nio.file.attribute.BasicFileAttributes import java.util.concurrent.atomic.AtomicInteger @@ -87,4 +88,7 @@ object File { def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid + + def writeString[F[_]: Sync](file: Path, content: String): F[Path] = + Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 625738ef..9ee3850c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,13 +1,18 @@ package docspell.joex.process +import java.nio.file.Paths + import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.split.TextSplitter import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates +import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task +import docspell.store.queries.QCollective import docspell.store.records.RAttachmentMeta object TextAnalysis { @@ -22,7 +27,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx.args, ctx.logger, analyser) + annotateAttachment[F](ctx, analyser) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -35,18 +40,64 @@ object TextAnalysis { } def annotateAttachment[F[_]: Sync]( - args: ProcessItemArgs, - logger: Logger[F], + ctx: Context[F, ProcessItemArgs], analyser: TextAnalyser[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordSettings(args.meta.language, false, None) + val settings = StanfordSettings(ctx.args.meta.language, false, None) for { + names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) + temp <- File.mkTempFile(Paths.get("."), "textanalysis") + _ <- File.writeString(temp, mkNerConfig(names)) + sett = settings.copy(regexNer = Some(temp)) labels <- analyser.annotate( - logger, - settings, - args.meta.collective, + ctx.logger, + sett, + ctx.args.meta.collective, rm.content.getOrElse("") ) + _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") + val tokens = + TextSplitter + .splitToken(str, delims) + .toVector + .take(3) + .map(w => s"(?i)${w.toLower.value}") + + tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } } diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index 2dc94e05..80b40207 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -1,5 +1,6 @@ package docspell.store.queries +import cats.data.OptionT import fs2.Stream import docspell.common.ContactKind @@ -11,6 +12,20 @@ import doobie._ import doobie.implicits._ object QCollective { + + case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String]) + object Names { + val empty = Names(Vector.empty, Vector.empty, Vector.empty) + } + + def allNames(collective: Ident): ConnectionIO[Names] = + (for { + orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name)) + pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name)) + equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name)) + } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))) + .getOrElse(Names.empty) + case class TagCount(tag: RTag, count: Int) case class InsightData(