Use collective's addressbook to configure regexner

This commit is contained in:
Eike Kettner
2020-08-24 14:35:56 +02:00
parent 8628a0a8b3
commit 96d2f948f2
4 changed files with 84 additions and 9 deletions

View File

@ -1,13 +1,18 @@
package docspell.joex.process
import java.nio.file.Paths
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.split.TextSplitter
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.queries.QCollective
import docspell.store.records.RAttachmentMeta
object TextAnalysis {
@ -22,7 +27,7 @@ object TextAnalysis {
t <-
item.metas.toList
.traverse(
annotateAttachment[F](ctx.args, ctx.logger, analyser)
annotateAttachment[F](ctx, analyser)
)
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
_ <- t.traverse(m =>
@ -35,18 +40,64 @@ object TextAnalysis {
}
def annotateAttachment[F[_]: Sync](
args: ProcessItemArgs,
logger: Logger[F],
ctx: Context[F, ProcessItemArgs],
analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(args.meta.language, false, None)
val settings = StanfordSettings(ctx.args.meta.language, false, None)
for {
names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
temp <- File.mkTempFile(Paths.get("."), "textanalysis")
_ <- File.writeString(temp, mkNerConfig(names))
sett = settings.copy(regexNer = Some(temp))
labels <- analyser.annotate(
logger,
settings,
args.meta.collective,
ctx.logger,
sett,
ctx.args.meta.collective,
rm.content.getOrElse("")
)
_ <- File.deleteFile(temp)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
def mkNerConfig(names: QCollective.Names): String = {
val orgs = names.org
.flatMap(Pattern(3))
.distinct
.map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
val pers =
names.pers
.flatMap(Pattern(2))
.distinct
.map(_.toRow("PERSON", "LOCATION,MISC"))
val equips =
names.equip
.flatMap(Pattern(1))
.distinct
.map(_.toRow("MISC", "LOCATION"))
(orgs ++ pers ++ equips).mkString("\n")
}
case class Pattern(value: String, weight: Int) {
def toRow(tag: String, overrideTags: String): String =
s"$value\t$tag\t$overrideTags\t$weight"
}
object Pattern {
def apply(weight: Int)(str: String): Vector[Pattern] = {
val delims = " \t\n\r".toSet
val words =
TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}")
val tokens =
TextSplitter
.splitToken(str, delims)
.toVector
.take(3)
.map(w => s"(?i)${w.toLower.value}")
tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
}
}
}