Use collective data with NER annotation

Eike Kettner 2020-08-24 23:25:57 +02:00
parent de5b33c40d
commit 3473cbb773
12 changed files with 413 additions and 76 deletions

View File

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {
   def hours: Long = minutes / 60
 
+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)
@@ -62,6 +68,9 @@ object Duration {
   def nanos(n: Long): Duration =
     Duration(n)
 
+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]
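Note: the new comparison operators and `Duration.between` exist to support the file-cache check in `RegexNerFile` further below. A minimal sketch of that use, assuming a cached `NerFile` with a `creation` timestamp and the configured `minTime`:

    // Age of the cached NER pattern file; regenerate when it is older than
    // the configured cache time (cf. RegexNerFile.doMakeFile below).
    val age: Duration = Duration.between(nerFile.creation, now)
    val expired: Boolean = age > cfg.minTime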

View File

@@ -12,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream
 
+import docspell.common.syntax.all._
+import io.circe.Decoder
+
 object File {
 
   def mkDir[F[_]: Sync](dir: Path): F[Path] =
@@ -91,4 +95,10 @@ object File {
 
   def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
     Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
 }
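`readJson` builds on the existing `readText` and circe: `parseJsonAs` comes from `docspell.common.syntax.all._` and yields an `Either`, which `rethrow` lifts into the effect. A short usage sketch, mirroring how `NerFile.find` below reads its cached state file:

    // Decode a JSON file into a value, assuming an implicit io.circe.Decoder
    // for the target type is in scope.
    val loaded: F[NerFile] = File.readJson[F, NerFile](file, blocker)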

View File

@@ -248,6 +248,29 @@ docspell.joex {
       # should suffice. Default is 10000, which are about 2-3 pages
       # (just a rough guess, of course).
       max-length = 10000
+
+      # A working directory for the analyser to store temporary/working
+      # files.
+      working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the address
+        # book of a collective as input for NER tagging (to automatically
+        # find correspondent and concerned entities). If the address book
+        # is large, this can be quite memory intensive and also makes text
+        # analysis slower. But it greatly improves accuracy. If this is
+        # false, NER tagging uses only statistical models (which also work
+        # quite well).
+        #
+        # This setting might be moved to the collective settings in the
+        # future.
+        enabled = true
+
+        # The NER annotation uses a file of patterns that is derived from
+        # a collective's address book. This is how long that file is kept
+        # until a check for a state change is done.
+        file-cache-time = "1 minute"
+      }
     }
 
     # Configuration for converting files into PDFs.

View File

@@ -1,11 +1,14 @@
 package docspell.joex
 
+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,
@@ -50,4 +53,19 @@ object Config {
   }
 
   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }
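A rough sketch of the value the `text-analysis` block from the config hunk above produces, hand-built here for illustration (docspell loads it from the config file; a `Duration.minutes` constructor is assumed):

    // Hypothetical equivalent of the defaults shown in joex.conf above.
    val textAnalysis = Config.TextAnalysis(
      maxLength = 10000,
      workingDir = java.nio.file.Paths.get("/tmp/docspell-analysis"),
      regexNer = Config.RegexNer(enabled = true, fileCacheTime = Duration.minutes(1))
    )
    textAnalysis.textAnalysisConfig // consumed by TextAnalyser.create
    textAnalysis.regexNerFileConfig // consumed by RegexNerFile.apply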

View File

@@ -11,6 +11,7 @@ import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
@@ -89,7 +90,8 @@ object JoexAppImpl {
       upload <- OUpload(store, queue, cfg.files, joex)
       fts <- createFtsClient(cfg)(httpClient)
       itemOps <- OItem(store, fts, queue, joex)
-      analyser <- TextAnalyser.create[F](cfg.textAnalysis)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -97,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts, analyser),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts, analyser),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )

View File

@@ -0,0 +1,99 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.implicits._
+
+import docspell.analysis.split.TextSplitter
+import docspell.common._
+import docspell.store.queries.QCollective
+
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
+  def nerFilePath(directory: Path): Path =
+    NerFile.nerFilePath(directory, collective)
+
+  def jsonFilePath(directory: Path) =
+    NerFile.jsonFilePath(directory, collective)
+}
+
+object NerFile {
+  implicit val jsonDecoder: Decoder[NerFile] =
+    deriveDecoder[NerFile]
+
+  implicit val jsonEncoder: Encoder[NerFile] =
+    deriveEncoder[NerFile]
+
+  private def nerFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.txt")
+
+  private def jsonFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.json")
+
+  def find[F[_]: Sync: ContextShift](
+      collective: Ident,
+      directory: Path,
+      blocker: Blocker
+  ): F[Option[NerFile]] = {
+    val file = jsonFilePath(directory, collective)
+    File.existsNonEmpty[F](file).flatMap {
+      case true =>
+        File
+          .readJson[F, NerFile](file, blocker)
+          .map(_.some)
+      case false =>
+        (None: Option[NerFile]).pure[F]
+    }
+  }
+
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+
+  case class Pattern(value: String, weight: Int) {
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter
+          .split(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .map(w => s"(?i)${w}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${w}")
+
+      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
+}
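To make the generated mapping format concrete, a sketch with hypothetical address-book names (tabs shown as `<TAB>`; the token patterns depend on `TextSplitter.splitToken` and are abbreviated):

    // Assuming QCollective.Names simply carries the collective's name lists.
    val names = QCollective.Names(
      org = Vector("Acme GmbH"),
      pers = Vector("Jane Doe"),
      equip = Vector("Printer")
    )
    NerFile.mkNerConfig(names)
    // (?i)acme (?i)gmbh<TAB>ORGANIZATION<TAB>LOCATION,PERSON,MISC<TAB>3
    // ...token patterns for "Acme GmbH"...
    // (?i)jane (?i)doe<TAB>PERSON<TAB>LOCATION,MISC<TAB>2
    // ...token patterns for "Jane Doe"...
    // (?i)printer<TAB>MISC<TAB>LOCATION<TAB>1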

View File

@@ -0,0 +1,164 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Semaphore
+import cats.implicits._
+
+import docspell.common._
+import docspell.common.syntax.all._
+import docspell.store.Store
+import docspell.store.queries.QCollective
+import docspell.store.records.REquipment
+import docspell.store.records.ROrganization
+import docspell.store.records.RPerson
+
+import io.circe.syntax._
+import org.log4s.getLogger
+
+/** Maintains a custom regex-ner file per collective for stanford's
+  * regexner annotator.
+  */
+trait RegexNerFile[F[_]] {
+
+  def makeFile(collective: Ident): F[Option[Path]]
+
+}
+
+object RegexNerFile {
+  private[this] val logger = getLogger
+
+  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+
+  def apply[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F]
+  ): Resource[F, RegexNerFile[F]] =
+    for {
+      dir <- File.withTempDir[F](cfg.directory, "regexner-")
+      writer <- Resource.liftF(Semaphore(1))
+    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
+
+  final private class Impl[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F],
+      writer: Semaphore[F] //TODO allow parallelism per collective
+  ) extends RegexNerFile[F] {
+
+    def makeFile(collective: Ident): F[Option[Path]] =
+      if (cfg.enabled) doMakeFile(collective)
+      else (None: Option[Path]).pure[F]
+
+    def doMakeFile(collective: Ident): F[Option[Path]] =
+      for {
+        now <- Timestamp.current[F]
+        existing <- NerFile.find[F](collective, cfg.directory, blocker)
+        result <- existing match {
+          case Some(nf) =>
+            val dur = Duration.between(nf.creation, now)
+            if (dur > cfg.minTime)
+              logger.fdebug(
+                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
+              ) *> updateFile(
+                collective,
+                now,
+                Some(nf)
+              )
+            else nf.nerFilePath(cfg.directory).some.pure[F]
+          case None =>
+            updateFile(collective, now, None)
+        }
+      } yield result
+
+    private def updateFile(
+        collective: Ident,
+        now: Timestamp,
+        current: Option[NerFile]
+    ): F[Option[Path]] =
+      for {
+        lastUpdate <- store.transact(Sql.latestUpdate(collective))
+        result <- lastUpdate match {
+          case None =>
+            (None: Option[Path]).pure[F]
+          case Some(lup) =>
+            current match {
+              case Some(cur) =>
+                val nerf =
+                  if (cur.updated == lup)
+                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
+                      cur,
+                      now
+                    ) *> cur.pure[F]
+                  else
+                    logger.fdebug(
+                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
+                    ) *> createFile(lup, collective, now)
+                nerf.map(_.nerFilePath(cfg.directory).some)
+              case None =>
+                createFile(lup, collective, now)
+                  .map(_.nerFilePath(cfg.directory).some)
+            }
+        }
+      } yield result
+
+    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
+      writer.withPermit(for {
+        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+        _ <- File.mkDir(file.getParent)
+        _ <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
+      } yield ())
+
+    private def createFile(
+        lastUpdate: Timestamp,
+        collective: Ident,
+        now: Timestamp
+    ): F[NerFile] = {
+      def update(nf: NerFile, text: String): F[Unit] =
+        writer.withPermit(for {
+          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
+          _ <- File.mkDir(jsonFile.getParent)
+          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
+          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
+        } yield ())
+
+      for {
+        _ <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
+        names <- store.transact(QCollective.allNames(collective))
+        nerFile = NerFile(collective, lastUpdate, now)
+        _ <- update(nerFile, NerFile.mkNerConfig(names))
+      } yield nerFile
+    }
+  }
+
+  object Sql {
+    import doobie._
+    import doobie.implicits._
+
+    import docspell.store.impl.Implicits._
+    import docspell.store.impl.Column
+
+    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
+      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
+        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
+
+      val sql =
+        List(
+          max(
+            ROrganization.Columns.updated,
+            ROrganization.table,
+            ROrganization.Columns.cid
+          ),
+          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
+          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
+        )
+          .reduce(_ ++ fr"UNION ALL" ++ _)
+
+      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
+        .query[Timestamp]
+        .option
+    }
+  }
+}
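`Sql.latestUpdate` selects the greatest `updated` timestamp across the collective's organizations, persons and equipment; when that maximum differs from the one recorded in the JSON state file, the pattern file is regenerated. A hypothetical caller, mirroring how `TextAnalysis` below consumes the resource (`annotate` stands in for the real pipeline call, `settings` for the `StanfordSettings` it receives):

    // makeFile returns None when the feature is disabled or the collective
    // has no address-book data yet.
    RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store).use { nerFile =>
      nerFile.makeFile(collective).flatMap { customNer =>
        annotate(settings.copy(regexNer = customNer)) // hypothetical helper
      }
    }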

View File

@@ -10,6 +10,7 @@ import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem
@@ -31,11 +32,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())
 
   def itemStateTask[F[_]: Sync, A](
@@ -54,11 +56,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>
@@ -68,7 +71,7 @@ object ItemHandler {
             .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data)
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
           .flatMap(itemStateTask(ItemState.Created))
     }

View File

@@ -7,6 +7,7 @@ import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 
 object ProcessItem {
@@ -15,11 +16,12 @@ object ProcessItem {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
       .flatMap(SetGivenData[F](itemOps))
      .flatMap(Task.setProgress(99))
@@ -27,15 +29,17 @@ object ProcessItem {
   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
 
   def analysisOnly[F[_]: Sync](
       cfg: Config,
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -44,12 +48,13 @@ object ProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg, analyser))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }

View File

@@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment
@@ -21,10 +22,11 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts, analyser))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())
 
   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -73,6 +75,7 @@ object ReProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -94,7 +97,7 @@ object ReProcessItem {
     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts, analyser)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -113,11 +116,12 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        processFiles[F](cfg, fts, analyser, data).attempt
+        processFiles[F](cfg, fts, analyser, regexNer, data).attempt
           .flatMap({
             case Right(d) =>
               Task.pure(d)
@@ -127,7 +131,7 @@ object ReProcessItem {
           ).andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        processFiles[F](cfg, fts, analyser, data)
+        processFiles[F](cfg, fts, analyser, regexNer, data)
     }
 
   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =

View File

@@ -1,24 +1,22 @@
 package docspell.joex.process
 
-import java.nio.file.Paths
-
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.StanfordSettings
-import docspell.analysis.split.TextSplitter
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
-import docspell.store.queries.QCollective
 import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
 
   def apply[F[_]: Sync](
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       for {
@@ -27,7 +25,7 @@ object TextAnalysis {
         t <-
           item.metas.toList
             .traverse(
-              annotateAttachment[F](ctx, analyser)
+              annotateAttachment[F](ctx, analyser, nerFile)
             )
         _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
         _ <- t.traverse(m =>
@@ -41,63 +39,19 @@ object TextAnalysis {
   def annotateAttachment[F[_]: Sync](
       ctx: Context[F, ProcessItemArgs],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
     val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
-      temp <- File.mkTempFile(Paths.get("."), "textanalysis")
-      _ <- File.writeString(temp, mkNerConfig(names))
-      sett = settings.copy(regexNer = Some(temp))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
       labels <- analyser.annotate(
         ctx.logger,
         sett,
         ctx.args.meta.collective,
         rm.content.getOrElse("")
       )
-      _ <- File.deleteFile(temp)
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
-
-  def mkNerConfig(names: QCollective.Names): String = {
-    val orgs = names.org
-      .flatMap(Pattern(3))
-      .distinct
-      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
-
-    val pers =
-      names.pers
-        .flatMap(Pattern(2))
-        .distinct
-        .map(_.toRow("PERSON", "LOCATION,MISC"))
-
-    val equips =
-      names.equip
-        .flatMap(Pattern(1))
-        .distinct
-        .map(_.toRow("MISC", "LOCATION"))
-
-    (orgs ++ pers ++ equips).mkString("\n")
-  }
-
-  case class Pattern(value: String, weight: Int) {
-    def toRow(tag: String, overrideTags: String): String =
-      s"$value\t$tag\t$overrideTags\t$weight"
-  }
-
-  object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
-      val delims = " \t\n\r".toSet
-      val words =
-        TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}")
-      val tokens =
-        TextSplitter
-          .splitToken(str, delims)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w.toLower.value}")
-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
-    }
-  }
 }

View File

@@ -91,6 +91,11 @@ let
     };
     text-analysis = {
      max-length = 10000;
+      regex-ner = {
+        enabled = true;
+        file-cache-time = "1 minute";
+      };
+      working-dir = "/tmp/docspell-analysis";
     };
     processing = {
       max-due-date-years = 10;
@@ -689,7 +694,48 @@ in {
                 (a rough guess).
               '';
             };
+            working-dir = mkOption {
+              type = types.str;
+              default = defaults.text-analysis.working-dir;
+              description = ''
+                A working directory for the analyser to store temporary/working
+                files.
+              '';
+            };
+            regex-ner = mkOption {
+              type = types.submodule({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.text-analysis.regex-ner.enabled;
+                    description = ''
+                      Whether to enable custom NER annotation. This uses the address
+                      book of a collective as input for NER tagging (to automatically
+                      find correspondent and concerned entities). If the address book
+                      is large, this can be quite memory intensive and also makes text
+                      analysis slower. But it greatly improves accuracy. If this is
+                      false, NER tagging uses only statistical models (which also work
+                      quite well).
+
+                      This setting might be moved to the collective settings in the
+                      future.
+                    '';
+                  };
+                  file-cache-time = mkOption {
+                    type = types.str;
+                    default = defaults.text-analysis.regex-ner.file-cache-time;
+                    description = ''
+                      The NER annotation uses a file of patterns that is derived from
+                      a collective's address book. This is how long that file is kept
+                      until a check for a state change is done.
+                    '';
+                  };
+                };
+              });
+              default = defaults.text-analysis.regex-ner;
+              description = "";
+            };
           };
         });
         default = defaults.text-analysis;