Use collective data with NER annotation

commit 3473cbb773 (parent de5b33c40d)
Duration.scala (docspell.common):

@@ -20,6 +20,12 @@ case class Duration(nanos: Long) {

   def hours: Long = minutes / 60

+  def >(other: Duration): Boolean =
+    nanos > other.nanos
+
+  def <(other: Duration): Boolean =
+    nanos < other.nanos
+
   def toScala: FiniteDuration =
     FiniteDuration(nanos, TimeUnit.NANOSECONDS)

@@ -62,6 +68,9 @@ object Duration {

   def nanos(n: Long): Duration =
     Duration(n)

+  def between(start: Timestamp, end: Timestamp): Duration =
+    apply(JDur.between(start.value, end.value))
+
   def stopTime[F[_]: Sync]: F[F[Duration]] =
     for {
       now <- Timestamp.current[F]
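The new operators make elapsed-time checks straightforward. A minimal usage sketch (the values, the `seconds` constructor, and `start`/`end` are illustrative assumptions, not part of this diff):

    // Hypothetical check built from the operations added above
    val limit = Duration.seconds(30)          // assumed existing constructor
    val span  = Duration.between(start, end)  // start, end: Timestamp
    if (span > limit) println("analysis took longer than 30s")

`between` delegates to java.time (`JDur.between`), so an `end` earlier than `start` yields a negative duration.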
File.scala (docspell.common):

@@ -12,6 +12,10 @@ import cats.effect._
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
+
+import io.circe.Decoder
+
 object File {

   def mkDir[F[_]: Sync](dir: Path): F[Path] =

@@ -91,4 +95,10 @@ object File {

   def writeString[F[_]: Sync](file: Path, content: String): F[Path] =
     Sync[F].delay(Files.write(file, content.getBytes(StandardCharsets.UTF_8)))
+
+  def readJson[F[_]: Sync: ContextShift, A](file: Path, blocker: Blocker)(implicit
+      d: Decoder[A]
+  ): F[A] =
+    readText[F](file, blocker).map(_.parseJsonAs[A]).rethrow
 }
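`readJson` combines the existing `readText` with the `parseJsonAs` syntax from `docspell.common.syntax`; `rethrow` turns the parse result's `Either[Throwable, A]` into a raised error in `F`. A minimal usage sketch (path and `blocker` are hypothetical):

    // assuming an implicit io.circe.Decoder[NerFile] is in scope
    val nf: F[NerFile] =
      File.readJson[F, NerFile](Paths.get("/tmp/c1.json"), blocker)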
reference.conf (docspell.joex):

@@ -248,6 +248,29 @@ docspell.joex {
       # should suffice. Default is 10000, which is about 2-3 pages
       # (just a rough guess, of course).
       max-length = 10000
+
+      # A working directory for the analyser to store temporary/working
+      # files.
+      working-dir = ${java.io.tmpdir}"/docspell-analysis"
+
+      regex-ner {
+        # Whether to enable custom NER annotation. This uses the address
+        # book of a collective as input for NER tagging (to automatically
+        # find correspondent and concerned entities). If the address book
+        # is large, this can be quite memory intensive and also makes text
+        # analysis slower. But it greatly improves accuracy. If this is
+        # false, NER tagging uses only statistical models (that also work
+        # quite well).
+        #
+        # This setting might be moved to the collective settings in the
+        # future.
+        enabled = true
+
+        # The NER annotation uses a file of patterns that is derived from
+        # a collective's address book. This determines how long that file
+        # is kept before checking for a state change.
+        file-cache-time = "1 minute"
+      }
     }

     # Configuration for converting files into PDFs.
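Since all keys live under `docspell.joex.text-analysis`, an operator who wants to avoid the memory overhead can switch the feature off with a single override (sketch; full key path inferred from the hunk above):

    docspell.joex.text-analysis.regex-ner.enabled = false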
Config.scala (docspell.joex):

@@ -1,11 +1,14 @@
 package docspell.joex

+import java.nio.file.Path
+
 import docspell.analysis.TextAnalysisConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
 import docspell.extract.ExtractConfig
 import docspell.ftssolr.SolrConfig
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.hk.HouseKeepingConfig
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -20,7 +23,7 @@ case class Config(
     userTasks: Config.UserTasks,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
-    textAnalysis: TextAnalysisConfig,
+    textAnalysis: Config.TextAnalysis,
     convert: ConvertConfig,
     sendMail: MailSendConfig,
     files: Files,
@@ -50,4 +53,19 @@ object Config {
   }

   case class Processing(maxDueDateYears: Int)
+
+  case class TextAnalysis(
+      maxLength: Int,
+      workingDir: Path,
+      regexNer: RegexNer
+  ) {
+
+    def textAnalysisConfig: TextAnalysisConfig =
+      TextAnalysisConfig(maxLength)
+
+    def regexNerFileConfig: RegexNerFile.Config =
+      RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
+  }
+
+  case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
 }
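The two projection methods keep the analysis library decoupled from joex's richer settings. A hedged sketch of how a parsed config fans out (values mirror the defaults above; `Duration.minutes` is an assumed existing constructor):

    val ta = Config.TextAnalysis(
      maxLength = 10000,
      workingDir = Paths.get("/tmp/docspell-analysis"),
      regexNer = Config.RegexNer(enabled = true, fileCacheTime = Duration.minutes(1))
    )
    ta.textAnalysisConfig // TextAnalysisConfig(10000), fed to TextAnalyser.create
    ta.regexNerFileConfig // RegexNerFile.Config(true, workingDir, 1 minute)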
JoexAppImpl.scala:

@@ -11,6 +11,7 @@ import docspell.backend.ops._
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.ftssolr.SolrFtsClient
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
 import docspell.joex.notify._
@@ -89,7 +90,8 @@ object JoexAppImpl {
       upload   <- OUpload(store, queue, cfg.files, joex)
       fts      <- createFtsClient(cfg)(httpClient)
       itemOps  <- OItem(store, fts, queue, joex)
-      analyser <- TextAnalyser.create[F](cfg.textAnalysis)
+      analyser <- TextAnalyser.create[F](cfg.textAnalysis.textAnalysisConfig)
+      regexNer <- RegexNerFile(cfg.textAnalysis.regexNerFileConfig, blocker, store)
       javaEmil =
         JavaMailEmil(blocker, Settings.defaultSettings.copy(debug = cfg.mailDebug))
       sch <- SchedulerBuilder(cfg.scheduler, blocker, store)
@@ -97,14 +99,14 @@ object JoexAppImpl {
         .withTask(
           JobTask.json(
             ProcessItemArgs.taskName,
-            ItemHandler.newItem[F](cfg, itemOps, fts, analyser),
+            ItemHandler.newItem[F](cfg, itemOps, fts, analyser, regexNer),
             ItemHandler.onCancel[F]
           )
         )
         .withTask(
           JobTask.json(
             ReProcessItemArgs.taskName,
-            ReProcessItem[F](cfg, fts, analyser),
+            ReProcessItem[F](cfg, fts, analyser, regexNer),
             ReProcessItem.onCancel[F]
           )
         )
NerFile.scala (new file, docspell.joex.analysis):

@@ -0,0 +1,99 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.implicits._
+
+import docspell.analysis.split.TextSplitter
+import docspell.common._
+import docspell.store.queries.QCollective
+
+import io.circe.generic.semiauto._
+import io.circe.{Decoder, Encoder}
+
+case class NerFile(collective: Ident, updated: Timestamp, creation: Timestamp) {
+  def nerFilePath(directory: Path): Path =
+    NerFile.nerFilePath(directory, collective)
+
+  def jsonFilePath(directory: Path) =
+    NerFile.jsonFilePath(directory, collective)
+}
+
+object NerFile {
+  implicit val jsonDecoder: Decoder[NerFile] =
+    deriveDecoder[NerFile]
+
+  implicit val jsonEncoder: Encoder[NerFile] =
+    deriveEncoder[NerFile]
+
+  private def nerFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.txt")
+
+  private def jsonFilePath(directory: Path, collective: Ident): Path =
+    directory.resolve(s"${collective.id}.json")
+
+  def find[F[_]: Sync: ContextShift](
+      collective: Ident,
+      directory: Path,
+      blocker: Blocker
+  ): F[Option[NerFile]] = {
+    val file = jsonFilePath(directory, collective)
+    File.existsNonEmpty[F](file).flatMap {
+      case true =>
+        File
+          .readJson[F, NerFile](file, blocker)
+          .map(_.some)
+      case false =>
+        (None: Option[NerFile]).pure[F]
+    }
+  }
+
+  def mkNerConfig(names: QCollective.Names): String = {
+    val orgs = names.org
+      .flatMap(Pattern(3))
+      .distinct
+      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
+
+    val pers =
+      names.pers
+        .flatMap(Pattern(2))
+        .distinct
+        .map(_.toRow("PERSON", "LOCATION,MISC"))
+
+    val equips =
+      names.equip
+        .flatMap(Pattern(1))
+        .distinct
+        .map(_.toRow("MISC", "LOCATION"))
+
+    (orgs ++ pers ++ equips).mkString("\n")
+  }
+
+  case class Pattern(value: String, weight: Int) {
+    def toRow(tag: String, overrideTags: String): String =
+      s"$value\t$tag\t$overrideTags\t$weight"
+  }
+
+  object Pattern {
+    def apply(weight: Int)(str: String): Vector[Pattern] = {
+      val delims = " \t\n\r".toSet
+      val words =
+        TextSplitter
+          .split(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .map(w => s"(?i)${w}")
+      val tokens =
+        TextSplitter
+          .splitToken(str, delims)
+          .map(_.toLower.value.trim)
+          .filter(_.nonEmpty)
+          .toVector
+          .take(3)
+          .map(w => s"(?i)${w}")
+
+      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
+    }
+  }
+}
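`toRow` emits the tab-separated format Stanford's RegexNER annotator reads: pattern, tag, tags it may override, and priority. For a hypothetical organization name "Acme Corp" at weight 3, `Pattern.apply` yields the full phrase plus up to three single tokens (assuming `split` and `splitToken` both yield the two words here), so `mkNerConfig` would produce rows like the following (columns are tab-separated):

    (?i)acme (?i)corp    ORGANIZATION    LOCATION,PERSON,MISC    3
    (?i)acme             ORGANIZATION    LOCATION,PERSON,MISC    3
    (?i)corp             ORGANIZATION    LOCATION,PERSON,MISC    3

Lower-casing combined with the `(?i)` flag makes the match case-insensitive regardless of how the name is spelled in a document.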
RegexNerFile.scala (new file, docspell.joex.analysis):

@@ -0,0 +1,164 @@
+package docspell.joex.analysis
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Semaphore
+import cats.implicits._
+
+import docspell.common._
+import docspell.common.syntax.all._
+import docspell.store.Store
+import docspell.store.queries.QCollective
+import docspell.store.records.REquipment
+import docspell.store.records.ROrganization
+import docspell.store.records.RPerson
+
+import io.circe.syntax._
+import org.log4s.getLogger
+
+/** Maintains a custom regex-ner file per collective for stanford's
+  * regexner annotator.
+  */
+trait RegexNerFile[F[_]] {
+
+  def makeFile(collective: Ident): F[Option[Path]]
+
+}
+
+object RegexNerFile {
+  private[this] val logger = getLogger
+
+  case class Config(enabled: Boolean, directory: Path, minTime: Duration)
+
+  def apply[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F]
+  ): Resource[F, RegexNerFile[F]] =
+    for {
+      dir    <- File.withTempDir[F](cfg.directory, "regexner-")
+      writer <- Resource.liftF(Semaphore(1))
+    } yield new Impl[F](cfg.copy(directory = dir), blocker, store, writer)
+
+  final private class Impl[F[_]: Concurrent: ContextShift](
+      cfg: Config,
+      blocker: Blocker,
+      store: Store[F],
+      writer: Semaphore[F] //TODO allow parallelism per collective
+  ) extends RegexNerFile[F] {
+
+    def makeFile(collective: Ident): F[Option[Path]] =
+      if (cfg.enabled) doMakeFile(collective)
+      else (None: Option[Path]).pure[F]
+
+    def doMakeFile(collective: Ident): F[Option[Path]] =
+      for {
+        now      <- Timestamp.current[F]
+        existing <- NerFile.find[F](collective, cfg.directory, blocker)
+        result <- existing match {
+          case Some(nf) =>
+            val dur = Duration.between(nf.creation, now)
+            if (dur > cfg.minTime)
+              logger.fdebug(
+                s"Cache time elapsed (${dur} > ${cfg.minTime}). Check for new state."
+              ) *> updateFile(
+                collective,
+                now,
+                Some(nf)
+              )
+            else nf.nerFilePath(cfg.directory).some.pure[F]
+          case None =>
+            updateFile(collective, now, None)
+        }
+      } yield result
+
+    private def updateFile(
+        collective: Ident,
+        now: Timestamp,
+        current: Option[NerFile]
+    ): F[Option[Path]] =
+      for {
+        lastUpdate <- store.transact(Sql.latestUpdate(collective))
+        result <- lastUpdate match {
+          case None =>
+            (None: Option[Path]).pure[F]
+          case Some(lup) =>
+            current match {
+              case Some(cur) =>
+                val nerf =
+                  if (cur.updated == lup)
+                    logger.fdebug(s"No state change detected.") *> updateTimestamp(
+                      cur,
+                      now
+                    ) *> cur.pure[F]
+                  else
+                    logger.fdebug(
+                      s"There have been state changes for collective '${collective.id}'. Reload NER file."
+                    ) *> createFile(lup, collective, now)
+                nerf.map(_.nerFilePath(cfg.directory).some)
+              case None =>
+                createFile(lup, collective, now)
+                  .map(_.nerFilePath(cfg.directory).some)
+            }
+        }
+      } yield result
+
+    private def updateTimestamp(nf: NerFile, now: Timestamp): F[Unit] =
+      writer.withPermit(for {
+        file <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+        _    <- File.mkDir(file.getParent)
+        _    <- File.writeString(file, nf.copy(creation = now).asJson.spaces2)
+      } yield ())
+
+    private def createFile(
+        lastUpdate: Timestamp,
+        collective: Ident,
+        now: Timestamp
+    ): F[NerFile] = {
+      def update(nf: NerFile, text: String): F[Unit] =
+        writer.withPermit(for {
+          jsonFile <- Sync[F].pure(nf.jsonFilePath(cfg.directory))
+          _ <- logger.fdebug(s"Writing custom NER file for collective '${collective.id}'")
+          _ <- File.mkDir(jsonFile.getParent)
+          _ <- File.writeString(nf.nerFilePath(cfg.directory), text)
+          _ <- File.writeString(jsonFile, nf.asJson.spaces2)
+        } yield ())
+
+      for {
+        _     <- logger.finfo(s"Generating custom NER file for collective '${collective.id}'")
+        names <- store.transact(QCollective.allNames(collective))
+        nerFile = NerFile(collective, lastUpdate, now)
+        _     <- update(nerFile, NerFile.mkNerConfig(names))
+      } yield nerFile
+    }
+  }
+
+  object Sql {
+    import doobie._
+    import doobie.implicits._
+    import docspell.store.impl.Implicits._
+    import docspell.store.impl.Column
+
+    def latestUpdate(collective: Ident): ConnectionIO[Option[Timestamp]] = {
+      def max(col: Column, table: Fragment, cidCol: Column): Fragment =
+        selectSimple(col.max ++ fr"as t", table, cidCol.is(collective))
+
+      val sql =
+        List(
+          max(
+            ROrganization.Columns.updated,
+            ROrganization.table,
+            ROrganization.Columns.cid
+          ),
+          max(RPerson.Columns.updated, RPerson.table, RPerson.Columns.cid),
+          max(REquipment.Columns.updated, REquipment.table, REquipment.Columns.cid)
+        )
+          .reduce(_ ++ fr"UNION ALL" ++ _)
+
+      selectSimple(fr"MAX(t)", fr"(" ++ sql ++ fr") as x", Fragment.empty)
+        .query[Timestamp]
+        .option
+    }
+  }
+}
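For orientation, the doobie fragments in `Sql.latestUpdate` should render to SQL roughly like the following (table and column names are assumptions based on the record modules):

    SELECT MAX(t) FROM (
      SELECT MAX(updated) AS t FROM organization WHERE cid = ?
      UNION ALL
      SELECT MAX(updated) AS t FROM person WHERE cid = ?
      UNION ALL
      SELECT MAX(updated) AS t FROM equipment WHERE cid = ?
    ) AS x

`updateFile` above treats a missing timestamp as "no address book data" and returns no NER file path.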
ItemHandler.scala:

@@ -10,6 +10,7 @@ import docspell.backend.ops.OItem
 import docspell.common.{ItemState, ProcessItemArgs}
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task
 import docspell.store.queries.QItem
 import docspell.store.records.RItem
@@ -31,11 +32,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     CreateItem[F]
       .flatMap(itemStateTask(ItemState.Processing))
-      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser))
+      .flatMap(safeProcess[F](cfg, itemOps, fts, analyser, regexNer))
       .map(_ => ())

   def itemStateTask[F[_]: Sync, A](
@@ -54,11 +56,12 @@ object ItemHandler {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data).attempt.flatMap({
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data).attempt.flatMap({
           case Right(d) =>
             Task.pure(d)
           case Left(ex) =>

@@ -68,7 +71,7 @@ object ItemHandler {
             .andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        ProcessItem[F](cfg, itemOps, fts, analyser)(data)
+        ProcessItem[F](cfg, itemOps, fts, analyser, regexNer)(data)
           .flatMap(itemStateTask(ItemState.Created))
     }
ProcessItem.scala:

@@ -7,6 +7,7 @@ import docspell.backend.ops.OItem
 import docspell.common.ProcessItemArgs
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Task

 object ProcessItem {
@@ -15,11 +16,12 @@ object ProcessItem {
       cfg: Config,
       itemOps: OItem[F],
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ExtractArchive(item)
       .flatMap(Task.setProgress(20))
-      .flatMap(processAttachments0(cfg, fts, analyser, (40, 60, 80)))
+      .flatMap(processAttachments0(cfg, fts, analyser, regexNer, (40, 60, 80)))
       .flatMap(LinkProposal[F])
       .flatMap(SetGivenData[F](itemOps))
       .flatMap(Task.setProgress(99))
@@ -27,15 +29,17 @@ object ProcessItem {
   def processAttachments[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    processAttachments0[F](cfg, fts, analyser, (30, 60, 90))(item)
+    processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

   def analysisOnly[F[_]: Sync](
       cfg: Config,
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser)(item)
+    TextAnalysis[F](analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -44,12 +48,13 @@ object ProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       progress: (Int, Int, Int)
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     ConvertPdf(cfg.convert, item)
       .flatMap(Task.setProgress(progress._1))
       .flatMap(TextExtraction(cfg.extraction, fts))
       .flatMap(Task.setProgress(progress._2))
-      .flatMap(analysisOnly[F](cfg, analyser))
+      .flatMap(analysisOnly[F](cfg, analyser, regexNer))
       .flatMap(Task.setProgress(progress._3))
 }
ReProcessItem.scala:

@@ -8,6 +8,7 @@ import docspell.analysis.TextAnalyser
 import docspell.common._
 import docspell.ftsclient.FtsClient
 import docspell.joex.Config
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachment
@@ -21,10 +22,11 @@ object ReProcessItem {
   def apply[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   ): Task[F, Args, Unit] =
     loadItem[F]
-      .flatMap(safeProcess[F](cfg, fts, analyser))
+      .flatMap(safeProcess[F](cfg, fts, analyser, regexNer))
       .map(_ => ())

   def onCancel[F[_]: Sync: ContextShift]: Task[F, Args, Unit] =
@@ -73,6 +75,7 @@ object ReProcessItem {
       cfg: Config,
       fts: FtsClient[F],
       analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F],
       data: ItemData
   ): Task[F, Args, ItemData] = {
@@ -94,7 +97,7 @@ object ReProcessItem {

     getLanguage[F].flatMap { lang =>
       ProcessItem
-        .processAttachments[F](cfg, fts, analyser)(data)
+        .processAttachments[F](cfg, fts, analyser, regexNer)(data)
         .contramap[Args](convertArgs(lang))
     }
   }
@@ -113,11 +116,12 @@ object ReProcessItem {
   def safeProcess[F[_]: ConcurrentEffect: ContextShift](
       cfg: Config,
       fts: FtsClient[F],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      regexNer: RegexNerFile[F]
   )(data: ItemData): Task[F, Args, ItemData] =
     isLastRetry[F].flatMap {
       case true =>
-        processFiles[F](cfg, fts, analyser, data).attempt
+        processFiles[F](cfg, fts, analyser, regexNer, data).attempt
           .flatMap({
             case Right(d) =>
               Task.pure(d)

@@ -127,7 +131,7 @@ object ReProcessItem {
           ).andThen(_ => Sync[F].raiseError(ex))
         })
       case false =>
-        processFiles[F](cfg, fts, analyser, data)
+        processFiles[F](cfg, fts, analyser, regexNer, data)
     }

   private def logWarn[F[_]](msg: => String): Task[F, Args, Unit] =
TextAnalysis.scala (docspell.joex.process):

@@ -1,24 +1,22 @@
 package docspell.joex.process

-import java.nio.file.Paths
-
 import cats.effect._
 import cats.implicits._

 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.StanfordSettings
-import docspell.analysis.split.TextSplitter
 import docspell.common._
+import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
-import docspell.store.queries.QCollective
 import docspell.store.records.RAttachmentMeta

 object TextAnalysis {

   def apply[F[_]: Sync](
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
       for {

@@ -27,7 +25,7 @@ object TextAnalysis {
         t <-
           item.metas.toList
             .traverse(
-              annotateAttachment[F](ctx, analyser)
+              annotateAttachment[F](ctx, analyser, nerFile)
             )
         _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
         _ <- t.traverse(m =>

@@ -41,63 +39,19 @@ object TextAnalysis {

   def annotateAttachment[F[_]: Sync](
       ctx: Context[F, ProcessItemArgs],
-      analyser: TextAnalyser[F]
+      analyser: TextAnalyser[F],
+      nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
     val settings = StanfordSettings(ctx.args.meta.language, false, None)
     for {
-      names <- ctx.store.transact(QCollective.allNames(ctx.args.meta.collective))
-      temp  <- File.mkTempFile(Paths.get("."), "textanalysis")
-      _     <- File.writeString(temp, mkNerConfig(names))
-      sett = settings.copy(regexNer = Some(temp))
+      customNer <- nerFile.makeFile(ctx.args.meta.collective)
+      sett = settings.copy(regexNer = customNer)
       labels <- analyser.annotate(
         ctx.logger,
         sett,
         ctx.args.meta.collective,
         rm.content.getOrElse("")
       )
-      _ <- File.deleteFile(temp)
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
-
-  def mkNerConfig(names: QCollective.Names): String = {
-    val orgs = names.org
-      .flatMap(Pattern(3))
-      .distinct
-      .map(_.toRow("ORGANIZATION", "LOCATION,PERSON,MISC"))
-
-    val pers =
-      names.pers
-        .flatMap(Pattern(2))
-        .distinct
-        .map(_.toRow("PERSON", "LOCATION,MISC"))
-
-    val equips =
-      names.equip
-        .flatMap(Pattern(1))
-        .distinct
-        .map(_.toRow("MISC", "LOCATION"))
-
-    (orgs ++ pers ++ equips).mkString("\n")
-  }
-
-  case class Pattern(value: String, weight: Int) {
-    def toRow(tag: String, overrideTags: String): String =
-      s"$value\t$tag\t$overrideTags\t$weight"
-  }
-
-  object Pattern {
-    def apply(weight: Int)(str: String): Vector[Pattern] = {
-      val delims = " \t\n\r".toSet
-      val words =
-        TextSplitter.split(str, delims).toVector.map(w => s"(?i)${w.toLower.value}")
-      val tokens =
-        TextSplitter
-          .splitToken(str, delims)
-          .toVector
-          .take(3)
-          .map(w => s"(?i)${w.toLower.value}")
-
-      tokens.map(t => Pattern(t, weight)).prepended(Pattern(words.mkString(" "), weight))
-    }
-  }
 }
NixOS module:

@@ -91,6 +91,11 @@ let
       };
       text-analysis = {
         max-length = 10000;
+        regex-ner = {
+          enabled = true;
+          file-cache-time = "1 minute";
+        };
+        working-dir = "/tmp/docspell-analysis";
       };
       processing = {
         max-due-date-years = 10;
@@ -689,7 +694,48 @@ in {
             (a rough guess).
           '';
         };
+        working-dir = mkOption {
+          type = types.str;
+          default = defaults.text-analysis.working-dir;
+          description = ''
+            A working directory for the analyser to store temporary/working
+            files.
+          '';
+        };
+
+        regex-ner = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.regex-ner.enabled;
+                description = ''
+                  Whether to enable custom NER annotation. This uses the address
+                  book of a collective as input for NER tagging (to automatically
+                  find correspondent and concerned entities). If the address book
+                  is large, this can be quite memory intensive and also makes text
+                  analysis slower. But it greatly improves accuracy. If this is
+                  false, NER tagging uses only statistical models (that also work
+                  quite well).
+
+                  This setting might be moved to the collective settings in the
+                  future.
+                '';
+              };
+              file-cache-time = mkOption {
+                type = types.str;
+                default = defaults.text-analysis.regex-ner.file-cache-time;
+                description = ''
+                  The NER annotation uses a file of patterns that is derived from
+                  a collective's address book. This determines how long that file
+                  is kept before checking for a state change.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.regex-ner;
+          description = "";
+        };
       };
     });
     default = defaults.text-analysis;
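In a NixOS configuration the new knobs then surface as ordinary options, for example (sketch; the `services.docspell-joex` prefix is assumed from the module's naming):

    services.docspell-joex.text-analysis.regex-ner.enabled = false;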