Use collective's addressbook to configure regexner

This commit is contained in:
Eike Kettner 2020-08-24 14:35:56 +02:00
parent 8628a0a8b3
commit 96d2f948f2
4 changed files with 84 additions and 9 deletions

View File

@@ -3,12 +3,17 @@ package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.files.TestFiles
import docspell.common._
import edu.stanford.nlp.pipeline.StanfordCoreNLP
object TextAnalyserSuite extends SimpleTestSuite {
lazy val germanClassifier =
new StanfordCoreNLP(Properties.nerGerman(None, false))
lazy val englishClassifier =
new StanfordCoreNLP(Properties.nerEnglish(None))
test("find english ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
val expect = Vector(
NerLabel("Derek", NerTag.Person, 0, 5),
NerLabel("Jeter", NerTag.Person, 6, 11),
@@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
test("find german ner labels") {
val labels =
StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
val expect = Vector(
NerLabel("Max", NerTag.Person, 0, 3),
NerLabel("Mustermann", NerTag.Person, 4, 14),

View File

@@ -1,6 +1,7 @@
package docspell.common
import java.io.IOException
import java.nio.charset.StandardCharsets
import java.nio.file._
import java.nio.file.attribute.BasicFileAttributes
import java.util.concurrent.atomic.AtomicInteger
@@ -87,4 +88,7 @@ object File {
/** Reads the whole file as text, assuming UTF-8 encoding.
  *
  * The file is streamed in 8192-byte chunks on the given blocker, each chunk
  * is UTF-8 decoded, and the decoded pieces are concatenated (foldMonoid on
  * String) into a single result.
  */
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
  readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
/** Writes `content` to `file` as UTF-8 encoded bytes, suspended in `F`.
  *
  * Delegates to `java.nio.file.Files.write`, which creates or truncates the
  * file; the written path is returned.
  */
def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
  Sync[F].delay {
    val bytes = content.getBytes(StandardCharsets.UTF_8)
    Files.write(file, bytes)
  }
}

View File

@@ -1,13 +1,18 @@
package docspell.joex.process
import java.nio.file.Paths
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queries.QCollective
import docspell.store.records.RAttachmentMeta
object TextAnalysis {
@@ -22,7 +27,7 @@ object TextAnalysis {
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx.args, ctx.logger, analyser)
annotateAttachment[F](ctx, analyser)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
@@ -35,18 +40,64 @@ object TextAnalysis {
}
def annotateAttachment[F[_]: Sync](
args: ProcessItemArgs,
logger: Logger[F],
ctx: Context[F, ProcessItemArgs],
analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(args.meta.language, false, None)
val settings = StanfordSettings(ctx.args.meta.language, false, None)
for {
names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
temp <- File.mkTempFile(Paths.get("."), "textanalysis")
_ <- File.writeString(temp, mkNerConfig(names))
sett = settings.copy(regexNer = Some(temp))
labels <- analyser.annotate(
logger,
settings,
args.meta.collective,
ctx.logger,
sett,
ctx.args.meta.collective,
rm.content.getOrElse("")
)
_ <- File.deleteFile(temp)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
/** Renders a regexner mapping file from the collective's address book.
  *
  * Every name is expanded into patterns (see [[Pattern]]) and each pattern
  * becomes one tab-separated row: organizations get the highest weight (3),
  * persons weight 2 and equipments weight 1. The second argument of `toRow`
  * lists the NER classes a rule is allowed to overwrite.
  */
def mkNerConfig(names: QCollective.Names): String = {
  // One rule section: expand names into patterns, dedupe, render rows.
  def section(values: Vector[String], weight: Int, tag: String, overrides: String) =
    values
      .flatMap(Pattern(weight))
      .distinct
      .map(_.toRow(tag, overrides))

  val rows =
    section(names.org, 3, "ORGANIZATION", "LOCATION,PERSON,MISC") ++
      section(names.pers, 2, "PERSON", "LOCATION,MISC") ++
      section(names.equip, 1, "MISC", "LOCATION")
  rows.mkString("\n")
}
/** A single regexner rule: a token pattern and the rule's priority weight. */
case class Pattern(value: String, weight: Int) {

  /** Renders this rule as one tab-separated regexner mapping row:
    * pattern, NER tag, overridable tags, weight.
    */
  def toRow(tag: String, overrideTags: String): String =
    List(value, tag, overrideTags, weight.toString).mkString("\t")
}
object Pattern {

  /** Derives regexner patterns from a name.
    *
    * The first pattern matches the entire (lower-cased, case-insensitive)
    * word sequence of `str`; it is followed by one pattern per individual
    * token, limited to the first three tokens.
    */
  def apply(weight: Int)(str: String): Vector[Pattern] = {
    val separators = " \t\n\r".toSet
    // Case-insensitive regex matching the full word sequence of the name.
    val phrase =
      TextSplitter
        .split(str, separators)
        .toVector
        .map(word => s"(?i)${word.toLower.value}")
        .mkString(" ")
    // One pattern for each of (at most) the first three tokens.
    val tokenPatterns =
      TextSplitter
        .splitToken(str, separators)
        .toVector
        .take(3)
        .map(token => Pattern(s"(?i)${token.toLower.value}", weight))
    Pattern(phrase, weight) +: tokenPatterns
  }
}
}

View File

@@ -1,5 +1,6 @@
package docspell.store.queries
import cats.data.OptionT
import fs2.Stream
import docspell.common.ContactKind
@@ -11,6 +12,20 @@ import doobie._
import doobie.implicits._
object QCollective {
/** All names of a collective's address book, grouped by entity kind. */
case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])

object Names {
  /** A Names value with no entries of any kind. */
  val empty: Names = Names(Vector.empty, Vector.empty, Vector.empty)
}
/** Loads all organization, person and equipment names of the collective.
  *
  * NOTE: the previous version wrapped every step in OptionT.liftF — which
  * always produces a Some — so the trailing `.getOrElse(Names.empty)` was
  * dead code. A plain for-comprehension over ConnectionIO is equivalent
  * and simpler; the result type is unchanged.
  */
def allNames(collective: Ident): ConnectionIO[Names] =
  for {
    orgs <- ROrganization.findAllRef(collective, None, _.name)
    pers <- RPerson.findAllRef(collective, None, _.name)
    equp <- REquipment.findAll(collective, None, _.name)
  } yield Names(orgs.map(_.name), pers.map(_.name), equp.map(_.name))
case class TagCount(tag: RTag, count: Int)
case class InsightData(