mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-02 21:42:52 +00:00
Use collective's addressbook to configure regexner
This commit is contained in:
parent
8628a0a8b3
commit
96d2f948f2
@ -3,12 +3,17 @@ package docspell.analysis.nlp
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.files.TestFiles
|
||||
import docspell.common._
|
||||
import edu.stanford.nlp.pipeline.StanfordCoreNLP
|
||||
|
||||
object TextAnalyserSuite extends SimpleTestSuite {
|
||||
// NER pipeline for German text. Declared lazy, so it is only constructed
// when a test first accesses it.
lazy val germanClassifier: StanfordCoreNLP = {
  val props = Properties.nerGerman(None, false)
  new StanfordCoreNLP(props)
}

// NER pipeline for English text, likewise constructed on first access.
lazy val englishClassifier: StanfordCoreNLP = {
  val props = Properties.nerEnglish(None)
  new StanfordCoreNLP(props)
}
|
||||
|
||||
test("find english ner labels") {
|
||||
val labels =
|
||||
StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
|
||||
StanfordNerClassifier.runClassifier(englishClassifier, TestFiles.letterENText)
|
||||
val expect = Vector(
|
||||
NerLabel("Derek", NerTag.Person, 0, 5),
|
||||
NerLabel("Jeter", NerTag.Person, 6, 11),
|
||||
@ -44,7 +49,7 @@ object TextAnalyserSuite extends SimpleTestSuite {
|
||||
|
||||
test("find german ner labels") {
|
||||
val labels =
|
||||
StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
|
||||
StanfordNerClassifier.runClassifier(germanClassifier, TestFiles.letterDEText)
|
||||
val expect = Vector(
|
||||
NerLabel("Max", NerTag.Person, 0, 3),
|
||||
NerLabel("Mustermann", NerTag.Person, 4, 14),
|
||||
|
@ -1,6 +1,7 @@
|
||||
package docspell.common
|
||||
|
||||
import java.io.IOException
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.nio.file._
|
||||
import java.nio.file.attribute.BasicFileAttributes
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
@ -87,4 +88,7 @@ object File {
|
||||
|
||||
def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] =
|
||||
readAll[F](file, blocker, 8192).through(fs2.text.utf8Decode).compile.foldMonoid
|
||||
|
||||
/** Writes `content` to `file`, encoded as UTF-8.
  *
  * The write is suspended in `F` and only happens when the effect is run.
  * The result is the path returned by `Files.write`.
  */
def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
  Sync[F].delay {
    val bytes = content.getBytes(StandardCharsets.UTF_8)
    Files.write(file, bytes)
  }
|
||||
}
|
||||
|
@ -1,13 +1,18 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import java.nio.file.Paths
|
||||
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.TextAnalyser
|
||||
import docspell.analysis.nlp.StanfordSettings
|
||||
import docspell.analysis.split.TextSplitter
|
||||
import docspell.common._
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.queries.QCollective
|
||||
import docspell.store.records.RAttachmentMeta
|
||||
|
||||
object TextAnalysis {
|
||||
@ -22,7 +27,7 @@ object TextAnalysis {
|
||||
t <-
|
||||
item.metas.toList
|
||||
.traverse(
|
||||
annotateAttachment[F](ctx.args, ctx.logger, analyser)
|
||||
annotateAttachment[F](ctx, analyser)
|
||||
)
|
||||
_ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
|
||||
_ <- t.traverse(m =>
|
||||
@ -35,18 +40,64 @@ object TextAnalysis {
|
||||
}
|
||||
|
||||
/** Runs text analysis on the content of a single attachment.
  *
  * The names of the collective (see `QCollective.allNames`) are rendered
  * into a regexner configuration via `mkNerConfig` and written to a
  * temporary file, which is handed to the analyser through
  * `StanfordSettings.regexNer`.
  *
  * The temporary file is managed with `bracket`, so it is deleted when the
  * analysis succeeds and also when writing the configuration or the
  * analysis itself fails.
  *
  * @param ctx      task context providing the store, the logger and the
  *                 item arguments (language and collective)
  * @param analyser the analyser used to annotate the text
  * @param rm       attachment metadata; a missing `content` is analysed
  *                 as the empty string
  * @return the metadata with `nerlabels` replaced by all labels found,
  *         paired with the dates found in the text
  */
def annotateAttachment[F[_]: Sync](
    ctx: Context[F, ProcessItemArgs],
    analyser: TextAnalyser[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
  val settings = StanfordSettings(ctx.args.meta.language, false, None)
  for {
    names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
    // The temp file is requested relative to the current working directory.
    labels <- Sync[F].bracket(File.mkTempFile(Paths.get("."), "textanalysis")) { temp =>
      for {
        _ <- File.writeString(temp, mkNerConfig(names))
        sett = settings.copy(regexNer = Some(temp))
        result <- analyser.annotate(
          ctx.logger,
          sett,
          ctx.args.meta.collective,
          rm.content.getOrElse("")
        )
      } yield result
    }(temp => File.deleteFile(temp).void)
  } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
|
||||
|
||||
/** Renders the names of a collective as the text of a regexner mapping.
  *
  * Every name is expanded into patterns by `Pattern.apply`, duplicates
  * within a group are removed, and each pattern becomes one row via
  * `Pattern.toRow`. Rows are emitted in the order organizations, persons,
  * equipment and joined with newlines.
  *
  * Organizations use weight 3, persons weight 2 and equipment weight 1.
  */
def mkNerConfig(names: QCollective.Names): String = {
  // One group of rows: expand, de-duplicate, then format.
  def rows(
      entries: Vector[String],
      weight: Int,
      tag: String,
      overrideTags: String
  ): Vector[String] =
    entries
      .flatMap(entry => Pattern(weight)(entry))
      .distinct
      .map(pattern => pattern.toRow(tag, overrideTags))

  val organizationRows = rows(names.org, 3, "ORGANIZATION", "LOCATION,PERSON,MISC")
  val personRows       = rows(names.pers, 2, "PERSON", "LOCATION,MISC")
  val equipmentRows    = rows(names.equip, 1, "MISC", "LOCATION")

  Vector(organizationRows, personRows, equipmentRows).flatten.mkString("\n")
}
|
||||
|
||||
/** One entry of the generated regexner mapping.
  *
  * @param value  the pattern text placed in the first column
  * @param weight the number placed in the last column
  */
case class Pattern(value: String, weight: Int) {

  /** Formats this entry as a single tab-separated row with the columns
    * pattern, tag, override tags and weight (in that order).
    */
  def toRow(tag: String, overrideTags: String): String =
    Seq(value, tag, overrideTags, weight.toString).mkString("\t")
}
|
||||
|
||||
object Pattern {

  /** Expands a name into regexner patterns that all carry `weight`.
    *
    * The name is split on whitespace and every word is lower-cased. The
    * result contains, first, one pattern for the whole name (all words,
    * separated by a single space) and then one pattern for each of the
    * first three words individually.
    *
    * Each word is prefixed with `(?i)` and quoted with
    * `java.util.regex.Pattern.quote`, so that regex metacharacters in a
    * name (parentheses, `+`, `.`, ...) are matched literally rather than
    * producing an invalid or overly broad expression.
    *
    * A name without any non-empty word yields no patterns at all, so no
    * row with an empty pattern ends up in the configuration.
    *
    * @param weight the weight assigned to every produced pattern
    * @param str    the name to expand
    */
  def apply(weight: Int)(str: String): Vector[Pattern] = {
    val delims = " \t\n\r".toSet

    // Case-insensitive, literal match of a single word.
    def toRegex(word: String): String =
      s"(?i)${java.util.regex.Pattern.quote(word)}"

    val words =
      TextSplitter
        .split(str, delims)
        .toVector
        .map(_.toLower.value)
        .filter(_.nonEmpty)
        .map(toRegex)

    val tokens =
      TextSplitter
        .splitToken(str, delims)
        .toVector
        .map(_.toLower.value)
        .filter(_.nonEmpty)
        .take(3)
        .map(toRegex)

    if (words.isEmpty) Vector.empty
    else
      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
  }
}
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
package docspell.store.queries
|
||||
|
||||
import cats.data.OptionT
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common.ContactKind
|
||||
@ -11,6 +12,20 @@ import doobie._
|
||||
import doobie.implicits._
|
||||
|
||||
object QCollective {
|
||||
|
||||
/** Names collected for a collective, in three groups.
  *
  * @param org   group filled from organizations by `allNames`
  * @param pers  group filled from persons by `allNames`
  * @param equip group filled from equipment by `allNames`
  */
case class Names(org: Vector[String], pers: Vector[String], equip: Vector[String])

object Names {

  /** A value with no names in any of the three groups. */
  val empty: Names = Names(org = Vector(), pers = Vector(), equip = Vector())
}
|
||||
|
||||
/** Loads the names of all organizations, persons and equipment of the
  * given collective.
  *
  * The three lookups run one after another in the same `ConnectionIO`.
  * Each is lifted into `OptionT` with `liftF`; should the combined value
  * be absent, `Names.empty` is returned instead.
  */
def allNames(collective: Ident): ConnectionIO[Names] = {
  val collected: OptionT[ConnectionIO, Names] =
    for {
      organizations <- OptionT.liftF(ROrganization.findAllRef(collective, None, _.name))
      persons       <- OptionT.liftF(RPerson.findAllRef(collective, None, _.name))
      equipments    <- OptionT.liftF(REquipment.findAll(collective, None, _.name))
    } yield Names(
      organizations.map(_.name),
      persons.map(_.name),
      equipments.map(_.name)
    )

  collected.getOrElse(Names.empty)
}
|
||||
|
||||
case class TagCount(tag: RTag, count: Int)
|
||||
|
||||
case class InsightData(
|
||||
|
Loading…
x
Reference in New Issue
Block a user