diff --git a/modules/common/src/main/scala/docspell/common/Duration.scala b/modules/common/src/main/scala/docspell/common/Duration.scala index f154a292..1c290c95 100644 --- a/modules/common/src/main/scala/docspell/common/Duration.scala +++ b/modules/common/src/main/scala/docspell/common/Duration.scala @@ -20,6 +20,12 @@ case class Duration(nanos: Long) { def hours: Long = minutes / 60 + def >(other: Duration): Boolean = + nanos > other.nanos + + def <(other: Duration): Boolean = + nanos < other.nanos + def toScala: FiniteDuration = FiniteDuration(nanos, TimeUnit.NANOSECONDS) @@ -62,6 +68,9 @@ object Duration { def nanos(n: Long): Duration = Duration(n) + def between(start: Timestamp, end: Timestamp): Duration = + apply(JDur.between(start.value, end.value)) + def stopTime[F[_]: Sync]: F[F[Duration]] = for { now <- Timestamp.current[F] diff --git a/modules/common/src/main/scala/docspell/common/File.scala b/modules/common/src/main/scala/docspell/common/File.scala index 2d5cfb8a..572291c5 100644 --- a/modules/common/src/main/scala/docspell/common/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -12,6 +12,10 @@ import cats.effect._ import cats.implicits._ import fs2.Stream +import docspell.common.syntax.all._ + +import io.circe.Decoder + object File { def mkDir[F[_]: Sync](dir: Path): F[Path] = @@ -91,4 +95,10 @@ object File { def writeString[F[_]: Sync](file: Path, content: String): F[Path] = Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8))) + + def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit + d: Decoder[A] + ): F[A] = + readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow + } diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index bd0de234..115d2893 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -248,6 +248,29 @@ docspell.joex { # should suffice. 
Default is 10000, which are about 2-3 pages # (just a rough guess, of course). max-length = 10000 + + # A working directory for the analyser to store temporary/working + # files. + working-dir = ${java.io.tmpdir}"/docspell-analysis" + + regex-ner { + # Whether to enable custom NER annotation. This uses the address + # book of a collective as input for NER tagging (to automatically + # find correspondent and concerned entities). If the address book + # is large, this can be quite memory intensive and also makes text + # analysis slower. But it greatly improves accuracy. If this is + # false, NER tagging uses only statistical models (that also work + # quite well). + # + # This setting might be moved to the collective settings in the + # future. + enabled = true + + # The NER annotation uses a file of patterns that is derived from + # a collective's address book. This is the time how long this + # file will be kept until a check for a state change is done. + file-cache-time = "1 minute" + } } # Configuration for converting files into PDFs. 
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 3625ffb1..cb6bb9f3 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -1,11 +1,14 @@ package docspell.joex +import java.nio.file.Path + import docspell.analysis.TextAnalysisConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig import docspell.extract.ExtractConfig import docspell.ftssolr.SolrConfig +import docspell.joex.analysis.RegexNerFile import docspell.joex.hk.HouseKeepingConfig import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig} import docspell.store.JdbcConfig @@ -20,7 +23,7 @@ case class Config( userTasks: Config.UserTasks, houseKeeping: HouseKeepingConfig, extraction: ExtractConfig, - textAnalysis: TextAnalysisConfig, + textAnalysis: Config.TextAnalysis, convert: ConvertConfig, sendMail: MailSendConfig, files: Files, @@ -50,4 +53,19 @@ object Config { } case class Processing(maxDueDateYears: Int) + + case class TextAnalysis( + maxLength: Int, + workingDir: Path, + regexNer: RegexNer + ) { + + def textAnalysisConfig: TextAnalysisConfig = + TextAnalysisConfig(maxLength) + + def regexNerFileConfig: RegexNerFile.Config = + RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) + } + + case class RegexNer(enabled: Boolean, fileCacheTime: Duration) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index dcea79df..2fa94c25 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -11,6 +11,7 @@ import docspell.backend.ops._ import docspell.common._ import docspell.ftsclient.FtsClient import docspell.ftssolr.SolrFtsClient +import docspell.joex.analysis.RegexNerFile import 
docspell.joex.fts.{MigrationTask, ReIndexTask} import docspell.joex.hk._ import docspell.joex.notify._ @@ -89,7 +90,8 @@ object JoexAppImpl { upload <- OUpload(store, queue, cfg.files, joex) fts <- createFtsClient(cfg)(httpClient) itemOps <- OItem(store, fts, queue, joex) - analyser <- TextAnalyser.create[F](cfg.textAnalysis) + analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig) + regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store) javaEmil = JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug)) sch <- SchedulerBuilder(cfg.scheduler, blocker, store) @@ -97,14 +99,14 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler.newItem[F](cfg, itemOps, fts, analyser), + ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer), ItemHandler.onCancel[F] ) ) .withTask( JobTask.json( ReProcessItemArgs.taskName, - ReProcessItem[F](cfg, fts, analyser), + ReProcessItem[F](cfg, fts, analyser, regexNer), ReProcessItem.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala new file mode 100644 index 00000000..f7abe029 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/analysis/NerFile.scala @@ -0,0 +1,99 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.implicits._ + +import docspell.analysis.split.TextSplitter +import docspell.common._ +import docspell.store.queries.QCollective + +import io.circe.generic.semiauto._ +import io.circe.{Decoder, Encoder} + +case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) { + def nerFilePath(directory: Path): Path = + NerFile.nerFilePath(directory, collective) + + def jsonFilePath(directory: Path) = + NerFile.jsonFilePath(directory, collective) +} + +object NerFile { + implicit val jsonDecoder: Decoder[NerFile] = + deriveDecoder[NerFile] + + implicit val 
jsonEncoder: Encoder[NerFile] = + deriveEncoder[NerFile] + + private def nerFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.txt") + + private def jsonFilePath(directory: Path, collective: Ident): Path = + directory.resolve(s"${collective.id}.json") + + def find[F[_]: Sync: ContextShift]( + collective: Ident, + directory: Path, + blocker: Blocker + ): F[Option[NerFile]] = { + val file = jsonFilePath(directory, collective) + File.existsNonEmpty[F](file).flatMap { + case true => + File + .readJson[F, NerFile](file, blocker) + .map(_.some) + case false => + (None: Option[NerFile]).pure[F] + } + } + + def mkNerConfig(names: QCollective.Names): String = { + val orgs = names.org + .flatMap(Pattern(3)) + .distinct + .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) + + val pers = + names.pers + .flatMap(Pattern(2)) + .distinct + .map(_.toRow("PERSON", "LOCATION,MISC")) + + val equips = + names.equip + .flatMap(Pattern(1)) + .distinct + .map(_.toRow("MISC", "LOCATION")) + + (orgs ++ pers ++ equips).mkString("\n") + } + case class Pattern(value: String, weight: Int) { + def toRow(tag: String, overrideTags: String): String = + s"$value\t$tag\t$overrideTags\t$weight" + } + + object Pattern { + def apply(weight: Int)(str: String): Vector[Pattern] = { + val delims = " \t\n\r".toSet + val words = + TextSplitter + .split(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .map(w => s"(?i)${w}") + val tokens = + TextSplitter + .splitToken(str, delims) + .map(_.toLower.value.trim) + .filter(_.nonEmpty) + .toVector + .take(3) + .map(w => s"(?i)${w}") + + tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala new file mode 100644 index 00000000..570fc659 --- /dev/null +++ 
b/modules/joex/src/main/scala/docspell/joex/analysis/RegexNerFile.scala @@ -0,0 +1,164 @@ +package docspell.joex.analysis + +import java.nio.file.Path + +import cats.effect._ +import cats.effect.concurrent.Semaphore +import cats.implicits._ + +import docspell.common._ +import docspell.common.syntax.all._ +import docspell.store.Store +import docspell.store.queries.QCollective +import docspell.store.records.REquipment +import docspell.store.records.ROrganization +import docspell.store.records.RPerson + +import io.circe.syntax._ +import org.log4s.getLogger + +/** Maintains a custom regex-ner file per collective for stanford's + * regexner annotator. + */ +trait RegexNerFile[F[_]] { + + def makeFile(collective: Ident): F[Option[Path]] + +} + +object RegexNerFile { + private[this] val logger = getLogger + + case class Config(enabled: Boolean, directory: Path, minTime: Duration) + + def apply[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F] + ): Resource[F, RegexNerFile[F]] = + for { + dir <- File.withTempDir[F](cfg.directory, "regexner-") + writer <- Resource.liftF(Semaphore(1)) + } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer) + + final private class Impl[F[_]: Concurrent: ContextShift]( + cfg: Config, + blocker: Blocker, + store: Store[F], + writer: Semaphore[F] //TODO allow parallelism per collective + ) extends RegexNerFile[F] { + + def makeFile(collective: Ident): F[Option[Path]] = + if (cfg.enabled) doMakeFile(collective) + else (None: Option[Path]).pure[F] + + def doMakeFile(collective: Ident): F[Option[Path]] = + for { + now <- Timestamp.current[F] + existing <- NerFile.find[F](collective, cfg.directory, blocker) + result <- existing match { + case Some(nf) => + val dur = Duration.between(nf.creation, now) + if (dur > cfg.minTime) + logger.fdebug( + s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state." 
+ ) *> updateFile( + collective, + now, + Some(nf) + ) + else nf.nerFilePath(cfg.directory).some.pure[F] + case None => + updateFile(collective, now, None) + } + } yield result + + private def updateFile( + collective: Ident, + now: Timestamp, + current: Option[NerFile] + ): F[Option[Path]] = + for { + lastUpdate <- store.transact(Sql.latestUpdate(collective)) + result <- lastUpdate match { + case None => + (None: Option[Path]).pure[F] + case Some(lup) => + current match { + case Some(cur) => + val nerf = + if (cur.updated == lup) + logger.fdebug(s"No state change detected.") *> updateTimestamp( + cur, + now + ) *> cur.pure[F] + else + logger.fdebug( + s"There have been state changes for collective '${collective.id}'. Reload NER file." + ) *> createFile(lup, collective, now) + nerf.map(_.nerFilePath(cfg.directory).some) + case None => + createFile(lup, collective, now) + .map(_.nerFilePath(cfg.directory).some) + } + } + } yield result + + private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] = + writer.withPermit(for { + file <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- File.mkDir(file.getParent) + _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2) + } yield ()) + + private def createFile( + lastUpdate: Timestamp, + collective: Ident, + now: Timestamp + ): F[NerFile] = { + def update(nf: NerFile, text: String): F[Unit] = + writer.withPermit(for { + jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory)) + _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'") + _ <- File.mkDir(jsonFile.getParent) + _ <- File.writeString(nf.nerFilePath(cfg.directory), text) + _ <- File.writeString(jsonFile, nf.asJson.spaces2) + } yield ()) + + for { + _ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'") + names <- store.transact(QCollective.allNames(collective)) + nerFile = NerFile(collective, lastUpdate, now) + _ <- update(nerFile, NerFile.mkNerConfig(names)) + } yield nerFile + } + } + 
+ object Sql { + import doobie._ + import doobie.implicits._ + import docspell.store.impl.Implicits._ + import docspell.store.impl.Column + + def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = { + def max(col: Column, table: Fragment, cidCol: Column): Fragment = + selectSimple(col.max ++ fr"as t", table, cidCol.is(collective)) + + val sql = + List( + max( + ROrganization.Columns.updated, + ROrganization.table, + ROrganization.Columns.cid + ), + max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid), + max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid) + ) + .reduce(_ ++ fr"UNION ALL" ++ _) + + selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty) + .query[Option[Timestamp]] // MAX(t) is a single NULL row when nothing exists + .option.map(_.flatten) + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 240e7f54..acbf810b 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -10,6 +10,7 @@ import docspell.backend.ops.OItem import docspell.common.{ItemState, ProcessItemArgs} import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task import docspell.store.queries.QItem import docspell.store.records.RItem @@ -31,11 +32,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] ): Task[F, Args, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) - .flatMap(safeProcess[F](cfg, itemOps, fts, analyser)) + .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer)) .map(_ => ()) def itemStateTask[F[_]: Sync, A]( @@ -54,11 +56,12 @@ object ItemHandler { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + 
regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({ + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({ case Right(d) => Task.pure(d) case Left(ex) => @@ -68,7 +71,7 @@ object ItemHandler { .andThen(_ => Sync[F].raiseError(ex)) }) case false => - ProcessItem[F](cfg, itemOps, fts, analyser)(data) + ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data) .flatMap(itemStateTask(ItemState.Created)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index cd76e095..7b8b6431 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -7,6 +7,7 @@ import docspell.backend.ops.OItem import docspell.common.ProcessItemArgs import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Task object ProcessItem { @@ -15,11 +16,12 @@ object ProcessItem { cfg: Config, itemOps: OItem[F], fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ExtractArchive(item) .flatMap(Task.setProgress(20)) - .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80))) + .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80))) .flatMap(LinkProposal[F]) .flatMap(SetGivenData[F](itemOps)) .flatMap(Task.setProgress(99)) @@ -27,15 +29,17 @@ object ProcessItem { def processAttachments[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - processAttachments0[F](cfg, fts, analyser, (30, 60, 
90))(item) + processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item) def analysisOnly[F[_]: Sync]( cfg: Config, - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](analyser)(item) + TextAnalysis[F](analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) @@ -44,12 +48,13 @@ object ProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], progress: (Int, Int, Int) )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = ConvertPdf(cfg.convert, item) .flatMap(Task.setProgress(progress._1)) .flatMap(TextExtraction(cfg.extraction, fts)) .flatMap(Task.setProgress(progress._2)) - .flatMap(analysisOnly[F](cfg, analyser)) + .flatMap(analysisOnly[F](cfg, analyser, regexNer)) .flatMap(Task.setProgress(progress._3)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 53282539..bf6d2467 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser import docspell.common._ import docspell.ftsclient.FtsClient import docspell.joex.Config +import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.RAttachment @@ -21,10 +22,11 @@ object ReProcessItem { def apply[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] ): Task[F, Args, Unit] = loadItem[F] - .flatMap(safeProcess[F](cfg, fts, analyser)) + .flatMap(safeProcess[F](cfg, fts, analyser, regexNer)) .map(_ => ()) def onCancel[F[_]: Sync: ContextShift]: 
Task[F, Args, Unit] = @@ -73,6 +75,7 @@ object ReProcessItem { cfg: Config, fts: FtsClient[F], analyser: TextAnalyser[F], + regexNer: RegexNerFile[F], data: ItemData ): Task[F, Args, ItemData] = { @@ -94,7 +97,7 @@ object ReProcessItem { getLanguage[F].flatMap { lang => ProcessItem - .processAttachments[F](cfg, fts, analyser)(data) + .processAttachments[F](cfg, fts, analyser, regexNer)(data) .contramap[Args](convertArgs(lang)) } } @@ -113,11 +116,12 @@ object ReProcessItem { def safeProcess[F[_]: ConcurrentEffect: ContextShift]( cfg: Config, fts: FtsClient[F], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + regexNer: RegexNerFile[F] )(data: ItemData): Task[F, Args, ItemData] = isLastRetry[F].flatMap { case true => - processFiles[F](cfg, fts, analyser, data).attempt + processFiles[F](cfg, fts, analyser, regexNer, data).attempt .flatMap({ case Right(d) => Task.pure(d) @@ -127,7 +131,7 @@ object ReProcessItem { ).andThen(_ => Sync[F].raiseError(ex)) }) case false => - processFiles[F](cfg, fts, analyser, data) + processFiles[F](cfg, fts, analyser, regexNer, data) } private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 9ee3850c..abbb6870 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,24 +1,22 @@ package docspell.joex.process -import java.nio.file.Paths - import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser import docspell.analysis.nlp.StanfordSettings -import docspell.analysis.split.TextSplitter import docspell.common._ +import docspell.joex.analysis.RegexNerFile import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task -import docspell.store.queries.QCollective import 
docspell.store.records.RAttachmentMeta object TextAnalysis { def apply[F[_]: Sync]( - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { @@ -27,7 +25,7 @@ object TextAnalysis { t <- item.metas.toList .traverse( - annotateAttachment[F](ctx, analyser) + annotateAttachment[F](ctx, analyser, nerFile) ) _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}") _ <- t.traverse(m => @@ -41,63 +39,19 @@ object TextAnalysis { def annotateAttachment[F[_]: Sync]( ctx: Context[F, ProcessItemArgs], - analyser: TextAnalyser[F] + analyser: TextAnalyser[F], + nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { val settings = StanfordSettings(ctx.args.meta.language, false, None) for { - names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective)) - temp <- File.mkTempFile(Paths.get("."), "textanalysis") - _ <- File.writeString(temp, mkNerConfig(names)) - sett = settings.copy(regexNer = Some(temp)) + customNer <- nerFile.makeFile(ctx.args.meta.collective) + sett = settings.copy(regexNer = customNer) labels <- analyser.annotate( ctx.logger, sett, ctx.args.meta.collective, rm.content.getOrElse("") ) - _ <- File.deleteFile(temp) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } - - def mkNerConfig(names: QCollective.Names): String = { - val orgs = names.org - .flatMap(Pattern(3)) - .distinct - .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC")) - - val pers = - names.pers - .flatMap(Pattern(2)) - .distinct - .map(_.toRow("PERSON", "LOCATION,MISC")) - - val equips = - names.equip - .flatMap(Pattern(1)) - .distinct - .map(_.toRow("MISC", "LOCATION")) - - (orgs ++ pers ++ equips).mkString("\n") - } - - case class Pattern(value: String, weight: Int) { - def toRow(tag: String, overrideTags: String): String = - s"$value\t$tag\t$overrideTags\t$weight" - } - - object Pattern { - 
def apply(weight: Int)(str: String): Vector[Pattern] = { - val delims = " \t\n\r".toSet - val words = - TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}") - val tokens = - TextSplitter - .splitToken(str, delims) - .toVector - .take(3) - .map(w => s"(?i)${w.toLower.value}") - - tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight)) - } - } } diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 6e16581f..d550c2d3 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -91,6 +91,11 @@ let }; text-analysis = { max-length = 10000; + regex-ner = { + enabled = true; + file-cache-time = "1 minute"; + }; + working-dir = "/tmp/docspell-analysis"; }; processing = { max-due-date-years = 10; @@ -689,7 +694,48 @@ in { (a rough guess). ''; }; + working-dir = mkOption { + type = types.str; + default = defaults.text-analysis.working-dir; + description = '' + A working directory for the analyser to store temporary/working + files. + ''; + }; + regex-ner = mkOption { + type = types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.text-analysis.regex-ner.enabled; + description = '' + Whether to enable custom NER annotation. This uses the address + book of a collective as input for NER tagging (to automatically + find correspondent and concerned entities). If the address book + is large, this can be quite memory intensive and also makes text + analysis slower. But it greatly improves accuracy. If this is + false, NER tagging uses only statistical models (that also work + quite well). + + This setting might be moved to the collective settings in the + future. + ''; + }; + file-cache-time = mkOption { + type = types.str; + default = defaults.text-analysis.regex-ner.file-cache-time; + description = '' + The NER annotation uses a file of patterns that is derived from + a collective's address book. 
This is the time how long this + file will be kept until a check for a state change is done. + ''; + }; + }; + }); + default = defaults.text-analysis.regex-ner; + description = ""; + }; }; }); default = defaults.text-analysis;