Use collective's addressbook to configure regexner

Eike Kettner 2020-08-24 14:35:56 +02:00
parent 8628a0a8b3
commit 96d2f948f2
4 changed files with 84 additions and 9 deletions

TextAnalyserSuite.scala (package docspell.analysis.nlp)

@@ -3,12 +3,17 @@ package docspell.analysis.nlp
 import minitest.SimpleTestSuite
 import docspell.files.TestFiles
 import docspell.common._
+import edu.stanford.nlp.pipeline.StanfordCoreNLP
 
 object TextAnalyserSuite extends SimpleTestSuite {
+  lazy val germanClassifier =
+    new StanfordCoreNLP(Properties.nerGerman(None, false))
+  lazy val englishClassifier =
+    new StanfordCoreNLP(Properties.nerEnglish(None))
 
   test("find english ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
+      StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
     val expect = Vector(
       NerLabel("Derek", NerTag.Person, 0, 5),
       NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
 
   test("find german ner labels") {
     val labels =
-      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
+      StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
     val expect = Vector(
       NerLabel("Max", NerTag.Person, 0, 3),
       NerLabel("Mustermann", NerTag.Person, 4, 14),

File.scala (package docspell.common)

@@ -1,6 +1,7 @@
 package docspell.common
 
 import java.io.IOException
+import java.nio.charset.StandardCharsets
 import java.nio.file._
 import java.nio.file.attribute.BasicFileAttributes
 import java.util.concurrent.atomic.AtomicInteger
@@ -87,4 +88,7 @@ object File {
 
   def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
     readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
+
+  def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
+    Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
 }
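The new writeString complements the existing readText. A small round-trip sketch under cats-effect 2 (IO, the file name, and the content are assumptions for illustration only):

import java.nio.file.Paths

import cats.effect.{Blocker, ContextShift, IO}
import docspell.common.File

// Write a string to a file, then read it back; readText does the blocking
// read through a Blocker. Path and content below are placeholders.
def roundTrip(implicit cs: ContextShift[IO]): IO[String] =
  Blocker[IO].use { blocker =>
    for {
      file <- File.writeString[IO](Paths.get("ner.txt"), "(?i)acme\tORGANIZATION")
      text <- File.readText[IO](file, blocker)
    } yield text
  }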

TextAnalysis.scala (package docspell.joex.process)

@@ -1,13 +1,18 @@
 package docspell.joex.process
 
+import java.nio.file.Paths
+
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.split.TextSplitter
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
+import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
+import docspell.store.queries.QCollective
 import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
@@ -22,7 +27,7 @@ object TextAnalysis {
         t <-
           item.metas.toList
             .traverse(
-              annotateAttachment[F](ctx.args, ctx.logger, analyser)
+              annotateAttachment[F](ctx, analyser)
             )
         _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
         _ <- t.traverse(m =>
@@ -35,18 +40,64 @@ object TextAnalysis {
   }
 
   def annotateAttachment[F[_]: Sync](
-      args: ProcessItemArgs,
-      logger: Logger[F],
+      ctx: Context[F, ProcessItemArgs],
       analyser: TextAnalyser[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(args.meta.language, false, None)
+    val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
+      names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
+      temp <- File.mkTempFile(Paths.get("."), "textanalysis")
+      _ <- File.writeString(temp, mkNerConfig(names))
+      sett = settings.copy(regexNer = Some(temp))
       labels <- analyser.annotate(
-        logger,
-        settings,
-        args.meta.collective,
+        ctx.logger,
+        sett,
+        ctx.args.meta.collective,
         rm.content.getOrElse("")
       )
+      _ <- File.deleteFile(temp)
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+
+  case class Pattern(value: String, weight: Int) {
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${w.toLower.value}")
+      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
 }
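Each row produced by Pattern.toRow follows the tab-separated mapping format Stanford CoreNLP's regexner annotator reads: a token pattern, the NER tag to assign, the tags it may overwrite, and a priority. A rough sketch of what a hypothetical collective would generate (the names are invented and TextSplitter's exact tokenization is only approximated here):

import docspell.joex.process.TextAnalysis
import docspell.store.queries.QCollective

// Invented address-book entries for illustration.
val names = QCollective.Names(
  org = Vector("Acme GmbH"),
  pers = Vector("Max Mustermann"),
  equip = Vector("Laptop")
)

// mkNerConfig(names) then yields tab-separated rows roughly like:
//
//   (?i)acme (?i)gmbh        ORGANIZATION   LOCATION,PERSON,MISC   3
//   (?i)acme                 ORGANIZATION   LOCATION,PERSON,MISC   3
//   (?i)gmbh                 ORGANIZATION   LOCATION,PERSON,MISC   3
//   (?i)max (?i)mustermann   PERSON         LOCATION,MISC          2
//   (?i)max                  PERSON         LOCATION,MISC          2
//   (?i)mustermann           PERSON         LOCATION,MISC          2
//   (?i)laptop               MISC           LOCATION               1
//
// This string is written to a temp file and handed to the Stanford settings
// as regexNer = Some(temp) in annotateAttachment above.
val config: String = TextAnalysis.mkNerConfig(names)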

QCollective.scala (package docspell.store.queries)

@@ -1,5 +1,6 @@
 package docspell.store.queries
 
+import cats.data.OptionT
 import fs2.Stream
 
 import docspell.common.ContactKind
@@ -11,6 +12,20 @@ import doobie._
 import doobie.implicits._
 
 object QCollective {
+
+  case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])
+  object Names {
+    val empty = Names(Vector.empty, Vector.empty, Vector.empty)
+  }
+
+  def allNames(collective: Ident): ConnectionIO[Names] =
+    (for {
+      orgs <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
+      pers <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
+      equp <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
+    } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name)))
+      .getOrElse(Names.empty)
+
   case class TagCount(tag: RTag, count: Int)
 
   case class InsightData(
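allNames simply gathers the organization, person, and equipment names of a collective into one value. A minimal sketch of running the query outside the job context (the Transactor and collective id are assumptions; TextAnalysis runs the same query through ctx.store.transact):

import cats.effect.IO
import doobie._
import doobie.implicits._
import docspell.common.Ident
import docspell.store.queries.QCollective

// Run the ConnectionIO against a given transactor; this mirrors
// ctx.store.transact(QCollective.allNames(...)) in annotateAttachment.
def namesFor(xa: Transactor[IO], collective: Ident): IO[QCollective.Names] =
  QCollective.allNames(collective).transact(xa)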