Mirror of https://github.com/TheAnachronism/docspell.git
Commit 8cb78e3dbe
@@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
 import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.StanfordTextClassifier
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._

 trait TextAnalyser[F[_]] {

   def annotate(
       logger: Logger[F],
-      settings: StanfordSettings,
+      settings: StanfordNerSettings,
       cacheKey: Ident,
       text: String
   ): F[TextAnalyser.Result]

+  def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F]
 }
 object TextAnalyser {

@@ -35,7 +38,7 @@ object TextAnalyser {
     new TextAnalyser[F] {
       def annotate(
           logger: Logger[F],
-          settings: StanfordSettings,
+          settings: StanfordNerSettings,
           cacheKey: Ident,
           text: String
       ): F[TextAnalyser.Result] =
@@ -48,6 +51,11 @@ object TextAnalyser {
         spans = NerLabelSpan.build(list)
       } yield Result(spans ++ list, dates)

+      def classifier(blocker: Blocker)(implicit
+          CS: ContextShift[F]
+      ): TextClassifier[F] =
+        new StanfordTextClassifier[F](cfg.classifier, blocker)

       private def textLimit(logger: Logger[F], text: String): F[String] =
         if (text.length <= cfg.maxLength) text.pure[F]
         else
@@ -56,7 +64,7 @@ object TextAnalyser {
           s" Analysing only first ${cfg.maxLength} characters."
         ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+      private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
          : F[Vector[NerLabel]] =
         StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
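Note: the new `classifier` method exposes text classification next to NER annotation. A minimal caller sketch, assuming an already created `analyser: TextAnalyser[IO]` and a `logger`; the method name and the use of `Language.English` are illustrative, not taken from this diff:

import cats.effect.IO
import docspell.analysis.nlp.StanfordNerSettings
import docspell.common._

def annotateOnce(
    analyser: TextAnalyser[IO],
    logger: Logger[IO],
    collective: Ident,
    text: String
): IO[TextAnalyser.Result] =
  // the collective id doubles as the pipeline cache key
  analyser.annotate(
    logger,
    StanfordNerSettings(Language.English, highRecall = false, regexNer = None),
    collective,
    text
  )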
@@ -1,5 +1,8 @@
 package docspell.analysis

+import docspell.analysis.nlp.TextClassifierConfig

 case class TextAnalysisConfig(
-    maxLength: Int
+    maxLength: Int,
+    classifier: TextClassifierConfig
 )
@@ -0,0 +1,5 @@
package docspell.analysis.nlp

import java.nio.file.Path

case class ClassifierModel(model: Path)
@@ -19,7 +19,7 @@ import org.log4s.getLogger
   */
 trait PipelineCache[F[_]] {

-  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+  def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP]

 }

@@ -28,7 +28,7 @@ object PipelineCache {

   def none[F[_]: Applicative]: PipelineCache[F] =
     new PipelineCache[F] {
-      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
         makeClassifier(settings).pure[F]
     }

@@ -38,7 +38,7 @@ object PipelineCache {
   final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
       extends PipelineCache[F] {

-    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+    def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
       for {
        id  <- makeSettingsId(settings)
        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
@@ -48,7 +48,7 @@ object PipelineCache {
       key: String,
       id: String,
       cache: Map[String, Entry],
-      settings: StanfordSettings
+      settings: StanfordNerSettings
   ): (Map[String, Entry], StanfordCoreNLP) =
     cache.get(key) match {
       case Some(entry) =>
@@ -68,7 +68,7 @@ object PipelineCache {
         (cache.updated(key, e), nlp)
     }

-  private def makeSettingsId(settings: StanfordSettings): F[String] = {
+  private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
     val base = settings.copy(regexNer = None).toString
     val size: F[Long] =
       settings.regexNer match {
@@ -81,7 +81,7 @@ object PipelineCache {
     }

   }
-  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+  private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
     logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
     new StanfordCoreNLP(Properties.forSettings(settings))
   }
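Note: the cache reuses a pipeline as long as the id derived in `makeSettingsId` is unchanged (the settings' `toString` with the regexner path blanked, combined with the regexner file's size), so edits to the regexner file force a rebuild. The `none` variant skips caching entirely; a small sketch, with `settings` standing in for any `StanfordNerSettings` value:

import cats.effect.IO
import edu.stanford.nlp.pipeline.StanfordCoreNLP

// The no-op cache builds a fresh pipeline on every obtain; handy in tests.
def freshPipeline(settings: StanfordNerSettings): IO[StanfordCoreNLP] =
  PipelineCache.none[IO].obtain("ignored-key", settings)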
@@ -7,6 +7,9 @@ import docspell.common._

 object Properties {

+  def fromMap(m: Map[String, String]): JProps =
+    apply(m.toSeq: _*)

   def apply(ps: (String, String)*): JProps = {
     val p = new JProps()
     for ((k, v) <- ps)
@@ -14,7 +17,7 @@ object Properties {
     p
   }

-  def forSettings(settings: StanfordSettings): JProps = {
+  def forSettings(settings: StanfordNerSettings): JProps = {
     val regexNerFile = settings.regexNer
       .map(p => p.normalize().toAbsolutePath().toString())
     settings.lang match {
@@ -25,7 +25,7 @@ object StanfordNerClassifier {
   def nerAnnotate[F[_]: Applicative](
       cacheKey: String,
       cache: PipelineCache[F]
-  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
       .map(crf => runClassifier(crf, text))
@@ -19,4 +19,8 @@ import docspell.common._
   * as a last step to tag untagged tokens using the provided list of
   * regexps.
   */
-case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
+case class StanfordNerSettings(
+    lang: Language,
+    highRecall: Boolean,
+    regexNer: Option[Path]
+)
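Note: a hedged construction example; the file path is made up, in the real flow it comes from `RegexNerFile.makeFile` (see the TextAnalysis change further down):

import java.nio.file.Paths
import docspell.common.Language

// German pipeline, normal recall, plus a collective-specific regexner
// file for tagging known names (hypothetical path).
val settings =
  StanfordNerSettings(
    lang = Language.German,
    highRecall = false,
    regexNer = Some(Paths.get("/var/docspell/coll1-names.conf"))
  )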
@@ -0,0 +1,153 @@
package docspell.analysis.nlp

import java.nio.file.Path

import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._
import fs2.Stream

import docspell.analysis.nlp.TextClassifier._
import docspell.common._

import edu.stanford.nlp.classify.ColumnDataClassifier

final class StanfordTextClassifier[F[_]: Sync: ContextShift](
    cfg: TextClassifierConfig,
    blocker: Blocker
) extends TextClassifier[F] {

  def trainClassifier[A](
      logger: Logger[F],
      data: Stream[F, Data]
  )(handler: TextClassifier.Handler[F, A]): F[A] =
    File
      .withTempDir(cfg.workingDir, "trainclassifier")
      .use { dir =>
        for {
          rawData   <- writeDataFile(blocker, dir, data)
          _         <- logger.info(s"Learning from ${rawData.count} items.")
          trainData <- splitData(logger, rawData)
          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
          sorted = scores.sortBy(-_.score)
          res <- handler(sorted.head.model)
        } yield res
      }

  def classify(
      logger: Logger[F],
      model: ClassifierModel,
      text: String
  ): F[Option[String]] =
    Sync[F].delay {
      val cls = ColumnDataClassifier.getClassifier(
        model.model.normalize().toAbsolutePath().toString()
      )
      val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
      Option(cat)
    }

  // --- helpers

  def train(
      logger: Logger[F],
      in: TrainData,
      props: Map[String, String]
  ): F[TrainResult] =
    for {
      _ <- logger.debug(s"Training classifier from $props")
      res <- Sync[F].delay {
        val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
        cdc.trainClassifier(in.train.toString())
        val score = cdc.testClassifier(in.test.toString())
        TrainResult(score.first(), ClassifierModel(in.modelFile))
      }
      _ <- logger.debug(s"Trained with result $res")
    } yield res

  def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
    val nTest = (in.count * 0.15).toLong

    val td =
      TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))

    val fileLines =
      fs2.io.file
        .readAll(in.file, blocker, 4096)
        .through(fs2.text.utf8Decode)
        .through(fs2.text.lines)

    for {
      _ <- logger.debug(
        s"Splitting raw data into test/train data. Testing with $nTest entries"
      )
      _ <-
        fileLines
          .take(nTest)
          .intersperse("\n")
          .through(fs2.text.utf8Encode)
          .through(fs2.io.file.writeAll(td.test, blocker))
          .compile
          .drain
      _ <-
        fileLines
          .drop(nTest)
          .intersperse("\n")
          .through(fs2.text.utf8Encode)
          .through(fs2.io.file.writeAll(td.train, blocker))
          .compile
          .drain
    } yield td
  }

  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
    val target = dir.resolve("rawdata")
    for {
      counter <- Ref.of[F, Long](0L)
      _ <-
        data
          .filter(_.text.nonEmpty)
          .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}")
          .evalTap(_ => counter.update(_ + 1))
          .intersperse("\r\n")
          .through(fs2.text.utf8Encode)
          .through(fs2.io.file.writeAll(target, blocker))
          .compile
          .drain
      lines <- counter.get
    } yield RawData(lines, target)

  }

  def normalisedText(text: String): String =
    text.replaceAll("[\n\r\t]+", " ")

  def fixRef(str: String): String =
    str.replace('\t', '_')

  def amendProps(
      trainData: TrainData,
      props: Map[String, String]
  ): Map[String, String] =
    prepend("2.", props) ++ Map(
      "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
      "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
      "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
    ).toList

  case class RawData(count: Long, file: Path)
  case class TrainData(train: Path, test: Path) {
    val modelFile = train.resolveSibling("model.ser.gz")
  }

  case class TrainResult(score: Double, model: ClassifierModel)

  def prepend(pre: String, data: Map[String, String]): Map[String, String] =
    data.toList
      .map({
        case (k, v) =>
          if (k.startsWith(pre)) (k, v)
          else (pre + k, v)
      })
      .toMap
}
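Note: the training file format is implied by `writeDataFile` and `classify`: three tab-separated columns (class, reference, text), which is why `classify` prefixes its input with "\t\t" to leave the first two columns empty. A small sketch of one training row, using the public helpers above (the invoice values are made up):

// Sketch: how one training row is laid out (three tab-separated columns).
def row(c: StanfordTextClassifier[cats.effect.IO]): String =
  s"invoice\t${c.fixRef("item\t42")}\t${c.normalisedText("total:\n$421")}"
// => "invoice\titem_42\ttotal: $421"
// fixRef turns tabs in the reference into underscores; normalisedText
// collapses newlines/tabs so each record stays on a single line.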
@@ -0,0 +1,25 @@
package docspell.analysis.nlp

import cats.data.Kleisli
import fs2.Stream

import docspell.analysis.nlp.TextClassifier.Data
import docspell.common._

trait TextClassifier[F[_]] {

  def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])(
      handler: TextClassifier.Handler[F, A]
  ): F[A]

  def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]]

}

object TextClassifier {

  type Handler[F[_], A] = Kleisli[F, ClassifierModel, A]

  case class Data(cls: String, ref: String, text: String)

}
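Note: `Handler` is a `Kleisli` from the trained model to a result, which lets `trainClassifier` hand the model to the caller while the temp directory holding it still exists; whatever should survive must be done inside the handler. A sketch, reusing the `File.existsNonEmpty` helper that the test suite below also uses:

import cats.data.Kleisli
import cats.effect.IO

// Hypothetical handler: inspect the model file before trainClassifier
// removes its temp directory.
val handler: TextClassifier.Handler[IO, Boolean] =
  Kleisli(model => File.existsNonEmpty[IO](model.model))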
@@ -0,0 +1,10 @@
package docspell.analysis.nlp

import java.nio.file.Path

import cats.data.NonEmptyList

case class TextClassifierConfig(
    workingDir: Path,
    classifierConfigs: NonEmptyList[Map[String, String]]
)
BIN modules/analysis/src/test/resources/test.ser.gz (new binary file, not shown)
@@ -0,0 +1,76 @@
package docspell.analysis.nlp

import minitest._
import cats.effect._
import scala.concurrent.ExecutionContext
import java.nio.file.Paths
import cats.data.NonEmptyList
import docspell.common._
import fs2.Stream
import cats.data.Kleisli
import TextClassifier.Data

object StanfordTextClassifierSuite extends SimpleTestSuite {
  val logger = Logger.log4s[IO](org.log4s.getLogger)

  implicit val CS = IO.contextShift(ExecutionContext.global)

  test("learn from data") {
    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))

    val data =
      Stream
        .emit(Data("invoice", "n", "this is your invoice total $421"))
        .repeat
        .take(10)
        .zip(
          Stream
            .emit(Data("receipt", "n", "shopping receipt cheese cake bar"))
            .repeat
            .take(10)
        )
        .flatMap({
          case (a, b) =>
            Stream.emits(Seq(a, b))
        })
        .covary[IO]

    val modelExists =
      Blocker[IO].use { blocker =>
        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
        classifier.trainClassifier[Boolean](logger, data)(
          Kleisli(result => File.existsNonEmpty[IO](result.model))
        )
      }
    assertEquals(modelExists.unsafeRunSync(), true)
  }

  test("run classifier") {
    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
    val things = for {
      dir     <- File.withTempDir[IO](Paths.get("target"), "testcls")
      blocker <- Blocker[IO]
    } yield (dir, blocker)

    things
      .use {
        case (dir, blocker) =>
          val classifier = new StanfordTextClassifier[IO](cfg, blocker)

          val modelFile = dir.resolve("test.ser.gz")
          for {
            _ <-
              LenientUri
                .fromJava(getClass.getResource("/test.ser.gz"))
                .readURL[IO](4096, blocker)
                .through(fs2.io.file.writeAll(modelFile, blocker))
                .compile
                .drain
            model = ClassifierModel(modelFile)
            cat <- classifier.classify(logger, model, "there is receipt always")
            _ = assertEquals(cat, Some("receipt"))
          } yield ()
      }
      .unsafeRunSync()
  }
}
@@ -52,12 +52,12 @@ object BackendApp {
       queue      <- JobQueue(store)
       loginImpl  <- Login[F](store)
       signupImpl <- OSignup[F](store)
-      collImpl   <- OCollective[F](store)
+      joexImpl   <- OJoex(JoexClient(httpClient), store)
+      collImpl   <- OCollective[F](store, utStore, queue, joexImpl)
       sourceImpl <- OSource[F](store)
       tagImpl    <- OTag[F](store)
       equipImpl  <- OEquipment[F](store)
       orgImpl    <- OOrganization(store)
-      joexImpl   <- OJoex(JoexClient(httpClient), store)
       uploadImpl <- OUpload(store, queue, cfg.files, joexImpl)
       nodeImpl   <- ONode(store)
       jobImpl    <- OJob(store, joexImpl)
@@ -8,14 +8,21 @@ import docspell.backend.PasswordCrypt
 import docspell.backend.ops.OCollective._
 import docspell.common._
 import docspell.store.queries.QCollective
+import docspell.store.queue.JobQueue
 import docspell.store.records._
+import docspell.store.usertask.UserTask
+import docspell.store.usertask.UserTaskStore
 import docspell.store.{AddResult, Store}

+import com.github.eikek.calev._

 trait OCollective[F[_]] {

   def find(name: Ident): F[Option[RCollective]]

-  def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
+  def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]

+  def findSettings(collective: Ident): F[Option[OCollective.Settings]]

   def listUser(collective: Ident): F[Vector[RUser]]

@@ -43,6 +50,7 @@ trait OCollective[F[_]] {

   def findEnabledSource(sourceId: Ident): F[Option[RSource]]

+  def startLearnClassifier(collective: Ident): F[Unit]
 }

 object OCollective {
@@ -55,6 +63,8 @@ object OCollective {

   type Settings = RCollective.Settings
   val Settings = RCollective.Settings
+  type Classifier = RClassifierSetting.Classifier
+  val Classifier = RClassifierSetting.Classifier

   sealed trait PassChangeResult
   object PassChangeResult {
@@ -91,7 +101,12 @@ object OCollective {
     }
   }

-  def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] =
+  def apply[F[_]: Effect](
+      store: Store[F],
+      uts: UserTaskStore[F],
+      queue: JobQueue[F],
+      joex: OJoex[F]
+  ): Resource[F, OCollective[F]] =
     Resource.pure[F, OCollective[F]](new OCollective[F] {
       def find(name: Ident): F[Option[RCollective]] =
         store.transact(RCollective.findById(name))
@@ -101,6 +116,41 @@ object OCollective {
           .transact(RCollective.updateSettings(collective, sett))
           .attempt
           .map(AddResult.fromUpdate)
+          .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F])

+      def updateLearnClassifierTask(coll: Ident, sett: Settings) =
+        for {
+          id <- Ident.randomId[F]
+          on    = sett.classifier.map(_.enabled).getOrElse(false)
+          timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe(""))
+          ut = UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            on,
+            timer,
+            LearnClassifierArgs(coll)
+          )
+          _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut)
+          _ <- joex.notifyAllNodes
+        } yield ()

+      def startLearnClassifier(collective: Ident): F[Unit] =
+        for {
+          id <- Ident.randomId[F]
+          ut <- UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            true,
+            CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All),
+            LearnClassifierArgs(collective)
+          ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName))
+          job <- ut.toJob
+          _   <- queue.insert(job)
+          _   <- joex.notifyAllNodes
+        } yield ()

       def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
         store.transact(RCollective.getSettings(collective))

       def listUser(collective: Ident): F[Vector[RUser]] =
         store.transact(RUser.findAll(collective, _.login))
@@ -0,0 +1,35 @@
package docspell.common

import docspell.common.syntax.all._

import io.circe._
import io.circe.generic.semiauto._

/** Arguments to the classify-item task.
  *
  * This task is run periodically and learns from existing documents
  * to create a model for predicting tags of new documents. The user
  * must give a tag category as a subset of possible tags.
  */
case class LearnClassifierArgs(
    collective: Ident
) {

  def makeSubject: String =
    "Learn tags"

}

object LearnClassifierArgs {

  val taskName = Ident.unsafe("learn-classifier")

  implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
    deriveEncoder[LearnClassifierArgs]
  implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
    deriveDecoder[LearnClassifierArgs]

  def parse(str: String): Either[Throwable, LearnClassifierArgs] =
    str.parseJsonAs[LearnClassifierArgs]

}
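Note: a round-trip sketch of the derived codecs, assuming `Ident`'s usual string encoding; the collective id is made up:

import io.circe.syntax._

val args = LearnClassifierArgs(Ident.unsafe("coll1"))
val json = args.asJson.noSpaces            // {"collective":"coll1"}
val back = LearnClassifierArgs.parse(json) // Right(LearnClassifierArgs(coll1))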
@@ -271,6 +271,50 @@ docspell.joex {
     # file will be kept until a check for a state change is done.
     file-cache-time = "1 minute"
   }

+  # Settings for doing document classification.
+  #
+  # This works by learning from existing documents. A collective can
+  # specify a tag category and the system will try to predict a tag
+  # from this category for new incoming documents.
+  #
+  # This requires a statistical model that is computed from all
+  # existing documents. This process is run periodically as
+  # configured by the collective. It may require a lot of memory,
+  # depending on the amount of data.
+  #
+  # It utilises this NLP library: https://nlp.stanford.edu/.
+  classification {
+    # Whether to enable classification globally. Each collective can
+    # decide to disable it. If it is disabled here, no collective
+    # can use classification.
+    enabled = true
+
+    # If concerned with memory consumption, this restricts the
+    # number of items to consider. More are better for training. A
+    # negative value or zero means to train on all items.
+    item-count = 0
+
+    # These settings are used to configure the classifier. If
+    # multiple are given, they are all tried and the "best" is
+    # chosen at the end. See
+    # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+    # for more info about these settings. The settings here yielded
+    # good results with *my* dataset.
+    #
+    # Enclose regexps in triple quotes.
+    classifiers = [
+      { "useSplitWords" = "true"
+        "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
+        "splitWordsIgnoreRegexp" = """\s+"""
+        "useSplitPrefixSuffixNGrams" = "true"
+        "maxNGramLeng" = "4"
+        "minNGramLeng" = "1"
+        "splitWordShape" = "chris4"
+        "intern" = "true" # makes it slower but saves memory
+      }
+    ]
+  }
 }

 # Configuration for converting files into PDFs.
@@ -2,7 +2,10 @@ package docspell.joex

 import java.nio.file.Path

+import cats.data.NonEmptyList

 import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.nlp.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -57,15 +60,30 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      regexNer: RegexNer
+      regexNer: RegexNer,
+      classification: Classification
   ) {

     def textAnalysisConfig: TextAnalysisConfig =
-      TextAnalysisConfig(maxLength)
+      TextAnalysisConfig(
+        maxLength,
+        TextClassifierConfig(
+          workingDir,
+          NonEmptyList
+            .fromList(classification.classifiers)
+            .getOrElse(NonEmptyList.of(Map.empty))
+        )
+      )

     def regexNerFileConfig: RegexNerFile.Config =
       RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
   }

   case class RegexNer(enabled: Boolean, fileCacheTime: Duration)

+  case class Classification(
+      enabled: Boolean,
+      itemCount: Int,
+      classifiers: List[Map[String, String]]
+  )
 }
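Note: the `getOrElse` fallback guarantees the classifier config is non-empty even when `classifiers = []` is configured; training then still runs once with ColumnDataClassifier defaults. A sketch of both cases:

import cats.data.NonEmptyList

NonEmptyList.fromList(List.empty[Map[String, String]])
  .getOrElse(NonEmptyList.of(Map.empty)) // NonEmptyList(Map()): one default run
NonEmptyList.fromList(List(Map("useSplitWords" -> "true")))
  .getOrElse(NonEmptyList.of(Map.empty)) // keeps the configured property map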
@@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.notify._
 import docspell.joex.pdfconv.ConvertAllPdfTask
 import docspell.joex.pdfconv.PdfConvTask
@@ -159,6 +160,13 @@ object JoexAppImpl {
             ConvertAllPdfTask.onCancel[F]
           )
         )
+        .withTask(
+          JobTask.json(
+            LearnClassifierArgs.taskName,
+            LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
+            LearnClassifierTask.onCancel[F]
+          )
+        )
         .resource
       psch <- PeriodicScheduler.create(
         cfg.periodicScheduler,
@@ -0,0 +1,111 @@
package docspell.joex.learn

import cats.data.Kleisli
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.{Pipe, Stream}

import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.TextClassifier.Data
import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler._
import docspell.store.queries.QItem
import docspell.store.records.RClassifierSetting

import bitpeace.MimetypeHint

object LearnClassifierTask {
  val noClass = "__NONE__"
  val pageSep = " --n-- "

  type Args = LearnClassifierArgs

  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
    Task.log(_.warn("Cancelling learn-classifier task"))

  def apply[F[_]: Sync: ContextShift](
      cfg: Config.TextAnalysis,
      blocker: Blocker,
      analyser: TextAnalyser[F]
  ): Task[F, Args, Unit] =
    Task { ctx =>
      (for {
        sett <- findActiveSettings[F](ctx, cfg)
        data = selectItems(
          ctx,
          math.min(cfg.classification.itemCount, sett.itemCount).toLong,
          sett.category.getOrElse("")
        )
        _ <- OptionT.liftF(
          analyser
            .classifier(blocker)
            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
        )
      } yield ())
        .getOrElseF(logInactiveWarning(ctx.logger))
    }

  private def handleModel[F[_]: Sync: ContextShift](
      ctx: Context[F, Args],
      blocker: Blocker
  )(trainedModel: ClassifierModel): F[Unit] =
    for {
      oldFile <- ctx.store.transact(
        RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
      )
      _ <- ctx.logger.info("Storing new trained model")
      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
      newFile <-
        ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
      _ <- ctx.store.transact(
        RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
      )
      _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
      _ <- oldFile match {
        case Some(fid) =>
          ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
            ctx.store.bitpeace.delete(fid.id).compile.drain
        case None => ().pure[F]
      }
    } yield ()

  private def selectItems[F[_]](
      ctx: Context[F, Args],
      max: Long,
      category: String
  ): Stream[F, Data] = {
    val connStream =
      for {
        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
        tt <- Stream.eval(
          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
        )
      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
    ctx.store.transact(connStream.filter(_.text.nonEmpty))
  }

  private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
    if (max <= 0) identity
    else _.take(max)

  private def findActiveSettings[F[_]: Sync](
      ctx: Context[F, Args],
      cfg: Config.TextAnalysis
  ): OptionT[F, OCollective.Classifier] =
    if (cfg.classification.enabled)
      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
        .filter(_.enabled)
        .filter(_.category.nonEmpty)
        .map(OCollective.Classifier.fromRecord)
    else
      OptionT.none

  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
    logger.warn(
      "Classification is disabled. Check joex config and the collective settings."
    )
}
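Note: `restrictTo` implements the "zero or negative means all" rule from the config comment. It is private here, but its behavior is easy to state as a sketch (as if it were in scope):

import fs2.Stream

// max <= 0: identity pipe, the training stream is not limited
Stream(1, 2, 3).through(restrictTo(0)).toList // List(1, 2, 3)
// max > 0: cap the number of training items
Stream(1, 2, 3).through(restrictTo(2)).toList // List(1, 2)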
@@ -38,6 +38,9 @@ case class ItemData(
     copy(metas = next)
   }

+  def appendTags(tags: Seq[String]): ItemData =
+    copy(tags = (this.tags ++ tags.toList).distinct)

   def changeMeta(
       attachId: Ident,
       f: RAttachmentMeta => RAttachmentMeta
@@ -34,12 +34,12 @@ object ProcessItem {
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

-  def analysisOnly[F[_]: Sync](
+  def analysisOnly[F[_]: Sync: ContextShift](
       cfg: Config,
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser, regexNer)(item)
+    TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -1,23 +1,33 @@
 package docspell.joex.process

+import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
+import docspell.store.records.RClassifierSetting

+import bitpeace.RangeDef

 object TextAnalysis {
+  type Args = ProcessItemArgs

-  def apply[F[_]: Sync](
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
-  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  )(item: ItemData): Task[F, Args, ItemData] =
     Task { ctx =>
       for {
         _ <- ctx.logger.info("Starting text analysis")
@@ -34,15 +44,18 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
+      } yield item
+        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .appendTags(tag.toSeq)
     }

   def annotateAttachment[F[_]: Sync](
-      ctx: Context[F, ProcessItemArgs],
+      ctx: Context[F, Args],
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(ctx.args.meta.language, false, None)
+    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
     for {
       customNer <- nerFile.makeFile(ctx.args.meta.collective)
       sett = settings.copy(regexNer = customNer)
@@ -54,4 +67,42 @@ object TextAnalysis {
       )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }

+  def predictTag[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): OptionT[F, String] =
+    for {
+      model <- findActiveModel(ctx, cfg)
+      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
+      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
+      modelData =
+        ctx.store.bitpeace
+          .get(model.id)
+          .unNoneTerminate
+          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
+        val modelFile = dir.resolve("model.ser.gz")
+        modelData
+          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
+          .compile
+          .drain
+          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
+      }).filter(_ != LearnClassifierTask.noClass)
+      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
+    } yield cls

+  private def findActiveModel[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, Ident] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+        .filter(_.enabled)
+        .mapFilter(_.fileId)
+    else
+      OptionT.none

 }
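Note: the final `.filter(_ != LearnClassifierTask.noClass)` matters because untagged training rows are written with the sentinel class `__NONE__`; the classifier can legitimately answer with it, and filtering turns that into "no tag" instead of attaching a bogus one. A sketch of the effect:

import cats.data.OptionT
import cats.effect.IO

OptionT.fromOption[IO](Option("__NONE__"))
  .filter(_ != LearnClassifierTask.noClass)
  .value // IO(None): no tag is appended to the item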
@@ -1047,6 +1047,28 @@ paths:
           application/json:
             schema:
               $ref: "#/components/schemas/ContactList"

+  /sec/collective/classifier/startonce:
+    post:
+      tags: [ Collective ]
+      summary: Starts the learn-classifier task
+      description: |
+        If the collective has classification enabled, this will submit
+        the task for learning a classifier from existing data. This
+        task is usually run periodically as determined by the
+        collective settings.
+
+        The request is empty, settings are used from the collective.
+      security:
+        - authTokenHeader: []
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"

   /sec/user:
     get:
       tags: [ Collective ]
@@ -3643,12 +3665,14 @@ components:
       description: DateTime
       type: integer
       format: date-time

    CollectiveSettings:
      description: |
        Settings for a collective.
      required:
        - language
        - integrationEnabled
+        - classifier
      properties:
        language:
          type: string
@@ -3658,6 +3682,31 @@ components:
          description: |
            Whether the collective has the integration endpoint
            enabled.
+        classifier:
+          $ref: "#/components/schemas/ClassifierSetting"

+    ClassifierSetting:
+      description: |
+        Settings for learning a document classifier.
+      required:
+        - enabled
+        - schedule
+        - itemCount
+      properties:
+        enabled:
+          type: boolean
+        category:
+          type: string
+        itemCount:
+          type: integer
+          format: int32
+          description: |
+            The max. number of items to learn from. The newest items
+            are considered.
+        schedule:
+          type: string
+          format: calevent

    SourceList:
      description: |
        A list of sources.
@@ -10,6 +10,7 @@ import docspell.restapi.model._
 import docspell.restserver.conv.Conversions
 import docspell.restserver.http4s._

+import com.github.eikek.calev.CalEvent
 import org.http4s.HttpRoutes
 import org.http4s.circe.CirceEntityDecoder._
 import org.http4s.circe.CirceEntityEncoder._
@@ -37,7 +38,18 @@ object CollectiveRoutes {
       case req @ POST -> Root / "settings" =>
         for {
           settings <- req.as[CollectiveSettings]
-          sett = OCollective.Settings(settings.language, settings.integrationEnabled)
+          sett = OCollective.Settings(
+            settings.language,
+            settings.integrationEnabled,
+            Some(
+              OCollective.Classifier(
+                settings.classifier.enabled,
+                settings.classifier.schedule,
+                settings.classifier.itemCount,
+                settings.classifier.category
+              )
+            )
+          )
           res <-
             backend.collective
               .updateSettings(user.account.collective, sett)
@@ -46,8 +58,21 @@ object CollectiveRoutes {

       case GET -> Root / "settings" =>
         for {
-          collDb <- backend.collective.find(user.account.collective)
-          sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
+          settDb <- backend.collective.findSettings(user.account.collective)
+          sett = settDb.map(c =>
+            CollectiveSettings(
+              c.language,
+              c.integrationEnabled,
+              ClassifierSetting(
+                c.classifier.map(_.enabled).getOrElse(false),
+                c.classifier.flatMap(_.category),
+                c.classifier.map(_.itemCount).getOrElse(0),
+                c.classifier
+                  .map(_.schedule)
+                  .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
+              )
+            )
+          )
           resp <- sett.toResponse()
         } yield resp

@@ -63,6 +88,12 @@ object CollectiveRoutes {
           resp <- Ok(ContactList(res.map(Conversions.mkContact)))
         } yield resp

+      case POST -> Root / "classifier" / "startonce" =>
+        for {
+          _    <- backend.collective.startLearnClassifier(user.account.collective)
+          resp <- Ok(BasicResult(true, "Task submitted"))
+        } yield resp

       case GET -> Root =>
         for {
           collDb <- backend.collective.find(user.account.collective)
@@ -0,0 +1,9 @@
CREATE TABLE `classifier_setting` (
  `cid` varchar(254) not null primary key,
  `enabled` boolean not null,
  `schedule` varchar(254) not null,
  `category` varchar(254) not null,
  `file_id` varchar(254),
  `created` timestamp not null,
  foreign key (`cid`) references `collective`(`cid`)
);
@@ -0,0 +1,11 @@
CREATE TABLE "classifier_setting" (
  "cid" varchar(254) not null primary key,
  "enabled" boolean not null,
  "schedule" varchar(254) not null,
  "category" varchar(254) not null,
  "item_count" int not null,
  "file_id" varchar(254),
  "created" timestamp not null,
  foreign key ("cid") references "collective"("cid"),
  foreign key ("file_id") references "filemeta"("id")
);
@@ -67,8 +67,8 @@ trait DoobieSyntax {
     Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
-    Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
-      Fragment.const(") FROM ") ++ table ++ this.where(where)
+    Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
+      Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
     Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this
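Note: this is a plain bug fix. With the parentheses the statement reads as `DISTINCT(c1,c2)`, which databases parse as DISTINCT applied to a row-value or function call rather than as the DISTINCT qualifier on the select list, and which misbehaves or is rejected with more than one column. A sketch of the generated SQL, using the columns from `resolveTextAndTag` below:

// before: SELECT DISTINCT(m.content,t.tid,t.name) FROM ... WHERE ...
// after:  SELECT DISTINCT m.content,t.tid,t.name FROM ... WHERE ...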
@@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
 import docspell.common.{IdRef, _}
 import docspell.store.Store
 import docspell.store.impl.Implicits._
@@ -615,4 +616,75 @@ object QItem {
       .query[NameAndNotes]
       .streamWithChunkSize(chunkSize)
   }

+  def findAllNewesFirst(
+      collective: Ident,
+      chunkSize: Int
+  ): Stream[ConnectionIO, Ident] = {
+    val cols = Seq(RItem.Columns.id)
+    (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
+      orderBy(RItem.Columns.created.desc))
+      .query[Ident]
+      .streamWithChunkSize(chunkSize)
+  }

+  case class TagName(id: Ident, name: String)
+  case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])

+  def resolveTextAndTag(
+      collective: Ident,
+      itemId: Ident,
+      tagCategory: String,
+      pageSep: String
+  ): ConnectionIO[TextAndTag] = {
+    val aId    = RAttachment.Columns.id.prefix("a")
+    val aItem  = RAttachment.Columns.itemId.prefix("a")
+    val mId    = RAttachmentMeta.Columns.id.prefix("m")
+    val mText  = RAttachmentMeta.Columns.content.prefix("m")
+    val tiItem = RTagItem.Columns.itemId.prefix("ti")
+    val tiTag  = RTagItem.Columns.tagId.prefix("ti")
+    val tId    = RTag.Columns.tid.prefix("t")
+    val tName  = RTag.Columns.name.prefix("t")
+    val tCat   = RTag.Columns.category.prefix("t")
+    val iId    = RItem.Columns.id.prefix("i")
+    val iColl  = RItem.Columns.cid.prefix("i")

+    val cte = withCTE(
+      "tags" -> selectSimple(
+        Seq(tiItem, tId, tName),
+        RTagItem.table ++ fr"ti INNER JOIN" ++
+          RTag.table ++ fr"t ON" ++ tId.is(tiTag),
+        and(tiItem.is(itemId), tCat.is(tagCategory))
+      )
+    )

+    val cols = Seq(mText, tId, tName)

+    val from = RItem.table ++ fr"i INNER JOIN" ++
+      RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
+      RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
+      fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)

+    val where =
+      and(
+        iId.is(itemId),
+        iColl.is(collective),
+        mText.isNotNull,
+        mText.isNot("")
+      )

+    val q = cte ++ selectDistinct(cols, from, where)
+    for {
+      _ <- logger.ftrace[ConnectionIO](
+        s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
+      )
+      texts <- q.query[(String, Option[TagName])].to[List]
+      _ <- logger.ftrace[ConnectionIO](
+        s"Got ${texts.size} text and tag entries for item ${itemId.id}"
+      )
+      tag = texts.headOption.flatMap(_._2)
+      txt = texts.map(_._1).mkString(pageSep)
+    } yield TextAndTag(itemId, txt, tag)
+  }

 }
@@ -0,0 +1,113 @@
package docspell.store.records

import cats.implicits._

import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._

import com.github.eikek.calev._
import doobie._
import doobie.implicits._

case class RClassifierSetting(
    cid: Ident,
    enabled: Boolean,
    schedule: CalEvent,
    category: String,
    itemCount: Int,
    fileId: Option[Ident],
    created: Timestamp
) {}

object RClassifierSetting {

  val table = fr"classifier_setting"

  object Columns {
    val cid       = Column("cid")
    val enabled   = Column("enabled")
    val schedule  = Column("schedule")
    val category  = Column("category")
    val itemCount = Column("item_count")
    val fileId    = Column("file_id")
    val created   = Column("created")
    val all       = List(cid, enabled, schedule, category, itemCount, fileId, created)
  }
  import Columns._

  def insert(v: RClassifierSetting): ConnectionIO[Int] = {
    val sql =
      insertRow(
        table,
        all,
        fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
      )
    sql.update.run
  }

  def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
    val sql = updateRow(
      table,
      cid.is(v.cid),
      commas(
        enabled.setTo(v.enabled),
        schedule.setTo(v.schedule),
        category.setTo(v.category),
        itemCount.setTo(v.itemCount),
        fileId.setTo(v.fileId)
      )
    )
    sql.update.run
  }

  def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
    updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run

  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
    for {
      n1 <- updateRow(
        table,
        cid.is(v.cid),
        commas(
          enabled.setTo(v.enabled),
          schedule.setTo(v.schedule),
          itemCount.setTo(v.itemCount),
          category.setTo(v.category)
        )
      ).update.run
      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
    } yield n1 + n2

  def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
    val sql = selectSimple(all, table, cid.is(id))
    sql.query[RClassifierSetting].option
  }

  def delete(coll: Ident): ConnectionIO[Int] =
    deleteFrom(table, cid.is(coll)).update.run

  case class Classifier(
      enabled: Boolean,
      schedule: CalEvent,
      itemCount: Int,
      category: Option[String]
  ) {

    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
      RClassifierSetting(
        coll,
        enabled,
        schedule,
        category.getOrElse(""),
        itemCount,
        None,
        created
      )
  }
  object Classifier {
    def fromRecord(r: RClassifierSetting): Classifier =
      Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
  }

}
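Note: `updateSettings` is an update-then-insert upsert; the INSERT only runs when the UPDATE touched no row. A hedged call-site sketch; the transactor, timestamp, id and category values are assumptions for the example:

import cats.effect.IO
import doobie._
import doobie.implicits._

def saveClassifierSetting(xa: Transactor[IO], now: Timestamp): IO[Int] =
  RClassifierSetting
    .updateSettings(
      RClassifierSetting(
        cid = Ident.unsafe("coll1"),                   // hypothetical collective id
        enabled = true,
        schedule = CalEvent.unsafe("*-*-01 00:00:00"), // hypothetical schedule
        category = "doctype",                          // hypothetical tag category
        itemCount = 0,                                 // 0 means: learn from all items
        fileId = None,                                 // no trained model stored yet
        created = now
      )
    )
    .transact(xa) // 1 whether the row was updated or freshly inserted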
@@ -61,14 +61,47 @@ object RCollective {
     updateRow(table, id.is(cid), language.setTo(lang)).update.run

   def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
-    updateRow(
-      table,
-      id.is(cid),
-      commas(
-        language.setTo(settings.language),
-        integration.setTo(settings.integrationEnabled)
-      )
-    ).update.run
+    for {
+      n1 <- updateRow(
+        table,
+        id.is(cid),
+        commas(
+          language.setTo(settings.language),
+          integration.setTo(settings.integrationEnabled)
+        )
+      ).update.run
+      cls <-
+        Timestamp
+          .current[ConnectionIO]
+          .map(now => settings.classifier.map(_.toRecord(cid, now)))
+      n2 <- cls match {
+        case Some(cr) =>
+          RClassifierSetting.updateSettings(cr)
+        case None =>
+          RClassifierSetting.delete(cid)
+      }
+    } yield n1 + n2

+  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+    val cId   = id.prefix("c")
+    val CS    = RClassifierSetting.Columns
+    val csCid = CS.cid.prefix("cs")

+    val cols = Seq(
+      language.prefix("c"),
+      integration.prefix("c"),
+      CS.enabled.prefix("cs"),
+      CS.schedule.prefix("cs"),
+      CS.itemCount.prefix("cs"),
+      CS.category.prefix("cs")
+    )
+    val from = table ++ fr"c LEFT JOIN" ++
+      RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)

+    selectSimple(cols, from, cId.is(coll))
+      .query[Settings]
+      .option
+  }

   def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
     val sql = selectSimple(all, table, id.is(cid))
@@ -112,5 +145,10 @@ object RCollective {
     selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
   }

-  case class Settings(language: Language, integrationEnabled: Boolean)
+  case class Settings(
+      language: Language,
+      integrationEnabled: Boolean,
+      classifier: Option[RClassifierSetting.Classifier]
+  )

 }
@@ -88,6 +88,7 @@ module Api exposing
    , setItemNotes
    , setTags
    , setUnconfirmed
+    , startClassifier
    , startOnceNotifyDueItems
    , startOnceScanMailbox
    , startReIndex
@@ -795,6 +796,19 @@ versionInfo flags receive =
--- Collective


+startClassifier :
+    Flags
+    -> (Result Http.Error BasicResult -> msg)
+    -> Cmd msg
+startClassifier flags receive =
+    Http2.authPost
+        { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce"
+        , account = getAccount flags
+        , body = Http.emptyBody
+        , expect = Http.expectJson receive Api.Model.BasicResult.decoder
+        }


getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg
getTagCloud flags receive =
    Http2.authGet
@@ -218,12 +218,12 @@ loginInfo model =
            , menuEntry model
                CollectiveSettingPage
                [ i [ class "users circle icon" ] []
-                , text "Collective Settings"
+                , text "Collective Profile"
                ]
            , menuEntry model
                UserSettingPage
                [ i [ class "user circle icon" ] []
-                , text "User Settings"
+                , text "User Profile"
                ]
            , div [ class "divider" ] []
            , menuEntry model
modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm (new file, 204 lines)
@@ -0,0 +1,204 @@
module Comp.ClassifierSettingsForm exposing
    ( Model
    , Msg
    , getSettings
    , init
    , update
    , view
    )

import Api
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
import Api.Model.TagList exposing (TagList)
import Comp.CalEventInput
import Comp.FixedDropdown
import Comp.IntField
import Data.CalEvent exposing (CalEvent)
import Data.Flags exposing (Flags)
import Data.Validated exposing (Validated(..))
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck)
import Http
import Util.Tag


type alias Model =
    { enabled : Bool
    , categoryModel : Comp.FixedDropdown.Model String
    , category : Maybe String
    , scheduleModel : Comp.CalEventInput.Model
    , schedule : Validated CalEvent
    , itemCountModel : Comp.IntField.Model
    , itemCount : Maybe Int
    }


type Msg
    = GetTagsResp (Result Http.Error TagList)
    | ScheduleMsg Comp.CalEventInput.Msg
    | ToggleEnabled
    | CategoryMsg (Comp.FixedDropdown.Msg String)
    | ItemCountMsg Comp.IntField.Msg


init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
init flags sett =
    let
        newSchedule =
            Data.CalEvent.fromEvent sett.schedule
                |> Maybe.withDefault Data.CalEvent.everyMonth

        ( cem, cec ) =
            Comp.CalEventInput.init flags newSchedule
    in
    ( { enabled = sett.enabled
      , categoryModel = Comp.FixedDropdown.initString []
      , category = sett.category
      , scheduleModel = cem
      , schedule = Data.Validated.Unknown newSchedule
      , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
      , itemCount = Just sett.itemCount
      }
    , Cmd.batch
        [ Api.getTags flags "" GetTagsResp
        , Cmd.map ScheduleMsg cec
        ]
    )


getSettings : Model -> Validated ClassifierSetting
getSettings model =
    Data.Validated.map
        (\sch ->
            { enabled = model.enabled
            , category = model.category
            , schedule =
                Data.CalEvent.makeEvent sch
            , itemCount = Maybe.withDefault 0 model.itemCount
            }
        )
        model.schedule


update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
update flags msg model =
    case msg of
        GetTagsResp (Ok tl) ->
            let
                categories =
                    Util.Tag.getCategories tl.items
                        |> List.sort
            in
            ( { model
                | categoryModel = Comp.FixedDropdown.initString categories
                , category =
                    if model.category == Nothing then
                        List.head categories

                    else
                        model.category
              }
            , Cmd.none
            )

        GetTagsResp (Err _) ->
            ( model, Cmd.none )

        ScheduleMsg lmsg ->
            let
                ( cm, cc, ce ) =
                    Comp.CalEventInput.update
                        flags
                        (Data.Validated.value model.schedule)
                        lmsg
                        model.scheduleModel
            in
            ( { model
                | scheduleModel = cm
                , schedule = ce
              }
            , Cmd.map ScheduleMsg cc
            )

        ToggleEnabled ->
            ( { model | enabled = not model.enabled }
            , Cmd.none
            )

        CategoryMsg lmsg ->
            let
                ( mm, ma ) =
                    Comp.FixedDropdown.update lmsg model.categoryModel
            in
            ( { model
                | categoryModel = mm
                , category =
                    if ma == Nothing then
                        model.category

                    else
                        ma
              }
            , Cmd.none
            )

        ItemCountMsg lmsg ->
            let
                ( im, iv ) =
                    Comp.IntField.update lmsg model.itemCountModel
            in
            ( { model
                | itemCountModel = im
                , itemCount = iv
              }
            , Cmd.none
            )


view : Model -> Html Msg
view model =
    div []
        [ div
            [ class "field"
            ]
            [ div [ class "ui checkbox" ]
                [ input
                    [ type_ "checkbox"
                    , onCheck (\_ -> ToggleEnabled)
                    , checked model.enabled
                    ]
                    []
                , label [] [ text "Enable classification" ]
                , span [ class "small-info" ]
                    [ text "Disable document classification if not needed."
                    ]
                ]
            ]
        , div [ class "ui basic segment" ]
            [ text "Document classification tries to predict a tag for new incoming documents. This "
            , text "works by learning from existing documents in order to find common patterns within "
            , text "the text. The more documents you have correctly tagged, the better. Learning is done "
            , text "periodically based on a schedule and you need to specify a tag-group that should "
            , text "be used for learning."
            ]
        , div [ class "field" ]
            [ label [] [ text "Category" ]
            , Html.map CategoryMsg
                (Comp.FixedDropdown.viewString model.category
                    model.categoryModel
                )
            ]
        , Html.map ItemCountMsg
            (Comp.IntField.viewWithInfo
                "The maximum number of items to learn from, ordered by date, newest first. Use 0 to mean all."
                model.itemCount
                "field"
                model.itemCountModel
            )
        , div [ class "field" ]
            [ label [] [ text "Schedule" ]
            , Html.map ScheduleMsg
                (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
            ]
        ]
@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
|
||||
import Api
|
||||
import Api.Model.BasicResult exposing (BasicResult)
|
||||
import Api.Model.CollectiveSettings exposing (CollectiveSettings)
|
||||
import Comp.ClassifierSettingsForm
|
||||
import Comp.Dropdown
|
||||
import Data.Flags exposing (Flags)
|
||||
import Data.Language exposing (Language)
|
||||
import Data.UiSettings exposing (UiSettings)
|
||||
import Data.Validated exposing (Validated)
|
||||
import Html exposing (..)
|
||||
import Html.Attributes exposing (..)
|
||||
import Html.Events exposing (onCheck, onClick, onInput)
|
||||
@ -27,44 +29,60 @@ type alias Model =
|
||||
, initSettings : CollectiveSettings
|
||||
, fullTextConfirmText : String
|
||||
, fullTextReIndexResult : Maybe BasicResult
|
||||
, classifierModel : Comp.ClassifierSettingsForm.Model
|
||||
, startClassifierResult : Maybe BasicResult
|
||||
}
|
||||
|
||||
|
||||
init : CollectiveSettings -> Model
|
||||
init settings =
|
||||
init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
|
||||
init flags settings =
|
||||
let
|
||||
lang =
|
||||
Data.Language.fromString settings.language
|
||||
|> Maybe.withDefault Data.Language.German
|
||||
|
||||
( cm, cc ) =
|
||||
Comp.ClassifierSettingsForm.init flags settings.classifier
|
||||
in
|
||||
{ langModel =
|
||||
Comp.Dropdown.makeSingleList
|
||||
{ makeOption =
|
||||
\l ->
|
||||
{ value = Data.Language.toIso3 l
|
||||
, text = Data.Language.toName l
|
||||
, additional = ""
|
||||
}
|
||||
, placeholder = ""
|
||||
, options = Data.Language.all
|
||||
, selected = Just lang
|
||||
}
|
||||
, intEnabled = settings.integrationEnabled
|
||||
, initSettings = settings
|
||||
, fullTextConfirmText = ""
|
||||
, fullTextReIndexResult = Nothing
|
||||
}
|
||||
( { langModel =
|
||||
Comp.Dropdown.makeSingleList
|
||||
{ makeOption =
|
||||
\l ->
|
||||
{ value = Data.Language.toIso3 l
|
||||
, text = Data.Language.toName l
|
||||
, additional = ""
|
||||
}
|
||||
, placeholder = ""
|
||||
, options = Data.Language.all
|
||||
, selected = Just lang
|
||||
}
|
||||
, intEnabled = settings.integrationEnabled
|
||||
, initSettings = settings
|
||||
, fullTextConfirmText = ""
|
||||
, fullTextReIndexResult = Nothing
|
||||
, classifierModel = cm
|
||||
, startClassifierResult = Nothing
|
||||
}
|
||||
, Cmd.map ClassifierSettingMsg cc
|
||||
)
|
||||
|
||||
|
||||
getSettings : Model -> CollectiveSettings
|
||||
getSettings : Model -> Validated CollectiveSettings
|
||||
getSettings model =
|
||||
CollectiveSettings
|
||||
(Comp.Dropdown.getSelected model.langModel
|
||||
|> List.head
|
||||
|> Maybe.map Data.Language.toIso3
|
||||
|> Maybe.withDefault model.initSettings.language
|
||||
Data.Validated.map
|
||||
(\cls ->
|
||||
{ language =
|
||||
Comp.Dropdown.getSelected model.langModel
|
||||
|> List.head
|
||||
|> Maybe.map Data.Language.toIso3
|
||||
|> Maybe.withDefault model.initSettings.language
|
||||
, integrationEnabled = model.intEnabled
|
||||
, classifier = cls
|
||||
}
|
||||
)
|
||||
(Comp.ClassifierSettingsForm.getSettings
|
||||
model.classifierModel
|
||||
)
|
||||
model.intEnabled
|
||||
|
||||
|
||||
type Msg
|
||||
@ -73,6 +91,10 @@ type Msg
    | SetFullTextConfirm String
    | TriggerReIndex
    | TriggerReIndexResult (Result Http.Error BasicResult)
    | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
    | SaveSettings
    | StartClassifierTask
    | StartClassifierResp (Result Http.Error BasicResult)


update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@ -85,22 +107,15 @@ update flags msg model =

                nextModel =
                    { model | langModel = m2 }

                nextSettings =
                    if Comp.Dropdown.isDropdownChangeMsg m then
                        Just (getSettings nextModel)

                    else
                        Nothing
            in
            ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
            ( nextModel, Cmd.map LangDropdownMsg c2, Nothing )

        ToggleIntegrationEndpoint ->
            let
                nextModel =
                    { model | intEnabled = not model.intEnabled }
            in
            ( nextModel, Cmd.none, Just (getSettings nextModel) )
            ( nextModel, Cmd.none, Nothing )

        SetFullTextConfirm str ->
            ( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@ -138,12 +153,50 @@ update flags msg model =
            , Nothing
            )

        ClassifierSettingMsg lmsg ->
            let
                ( cm, cc ) =
                    Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
            in
            ( { model
                | classifierModel = cm
              }
            , Cmd.map ClassifierSettingMsg cc
            , Nothing
            )

        SaveSettings ->
            case getSettings model of
                Data.Validated.Valid s ->
                    ( model, Cmd.none, Just s )

                _ ->
                    ( model, Cmd.none, Nothing )

        StartClassifierTask ->
            ( model, Api.startClassifier flags StartClassifierResp, Nothing )

        StartClassifierResp (Ok br) ->
            ( { model | startClassifierResult = Just br }
            , Cmd.none
            , Nothing
            )

        StartClassifierResp (Err err) ->
            ( { model
                | startClassifierResult =
                    Just (BasicResult False (Util.Http.errorToString err))
              }
            , Cmd.none
            , Nothing
            )


view : Flags -> UiSettings -> Model -> Html Msg
view flags settings model =
    div
        [ classList
            [ ( "ui form", True )
            [ ( "ui form error success", True )
            , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
            , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
            ]
@ -219,17 +272,62 @@ view flags settings model =
            [ text "This starts a task that clears the full-text index and re-indexes all your data again."
            , text "You must type OK before clicking the button to avoid accidental re-indexing."
            ]
        , div
            [ classList
                [ ( "ui message", True )
                , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
                , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
                , ( "hidden invisible", model.fullTextReIndexResult == Nothing )
                ]
            ]
            [ Maybe.map .message model.fullTextReIndexResult
                |> Maybe.withDefault ""
                |> text
        , renderResultMessage model.fullTextReIndexResult
            ]
        , h3
            [ classList
                [ ( "ui dividing header", True )
                , ( "invisible hidden", False )
                ]
            ]
            [ text "Document Classifier"
            ]
        , div
            [ classList
                [ ( "field", True )
                , ( "invisible hidden", False )
                ]
            ]
            [ Html.map ClassifierSettingMsg
                (Comp.ClassifierSettingsForm.view model.classifierModel)
            , div [ class "ui vertical segment" ]
                [ button
                    [ classList
                        [ ( "ui small secondary basic button", True )
                        , ( "disabled", not model.classifierModel.enabled )
                        ]
                    , title "Starts a task to train a classifier"
                    , onClick StartClassifierTask
                    ]
                    [ text "Start now"
                    ]
                , renderResultMessage model.startClassifierResult
                ]
            ]
        , div [ class "ui divider" ] []
        , button
            [ classList
                [ ( "ui primary button", True )
                , ( "disabled", getSettings model |> Data.Validated.isInvalid )
                ]
            , onClick SaveSettings
            ]
            [ text "Save"
            ]
        ]


renderResultMessage : Maybe BasicResult -> Html msg
renderResultMessage result =
    div
        [ classList
            [ ( "ui message", True )
            , ( "error", Maybe.map .success result == Just False )
            , ( "success", Maybe.map .success result == Just True )
            , ( "hidden invisible", result == Nothing )
            ]
        ]
        [ Maybe.map .message result
            |> Maybe.withDefault ""
            |> text
        ]
@ -1,5 +1,6 @@
module Data.Validated exposing
    ( Validated(..)
    , isInvalid
    , map
    , map2
    , map3
@ -14,6 +15,19 @@ type Validated a
    | Unknown a


isInvalid : Validated a -> Bool
isInvalid v =
    case v of
        Valid _ ->
            False

        Invalid _ _ ->
            True

        Unknown _ ->
            False


value : Validated a -> a
value va =
    case va of
@ -30,15 +30,21 @@ init flags =
    let
        ( sm, sc ) =
            Comp.SourceManage.init flags

        ( cm, cc ) =
            Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
    in
    ( { currentTab = Just InsightsTab
      , sourceModel = sm
      , userModel = Comp.UserManage.emptyModel
      , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
      , settingsModel = cm
      , insights = Api.Model.ItemInsights.empty
      , submitResult = Nothing
      }
    , Cmd.map SourceMsg sc
    , Cmd.batch
        [ Cmd.map SourceMsg sc
        , Cmd.map SettingsFormMsg cc
        ]
    )


@ -77,7 +77,13 @@ update flags msg model =
            ( model, Cmd.none )

        CollectiveSettingsResp (Ok data) ->
            ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
            let
                ( cm, cc ) =
                    Comp.CollectiveSettingsForm.init flags data
            in
            ( { model | settingsModel = cm }
            , Cmd.map SettingsFormMsg cc
            )

        CollectiveSettingsResp (Err _) ->
            ( model, Cmd.none )
@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
viewSettings flags settings model =
    [ h2 [ class "ui header" ]
        [ i [ class "cog icon" ] []
        , text "Settings"
        , text "Collective Settings"
        ]
    , div [ class "ui segment" ]
        [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
        [ Html.map SettingsFormMsg
            (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
        ]
    , div
        [ classList
@ -95,6 +95,21 @@ let
          enabled = true;
          file-cache-time = "1 minute";
        };
        classification = {
          enabled = true;
          item-count = 0;
          classifiers = [
            { "useSplitWords" = "true";
              "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
              "splitWordsIgnoreRegexp" = ''\s+'';
              "useSplitPrefixSuffixNGrams" = "true";
              "maxNGramLeng" = "4";
              "minNGramLeng" = "1";
              "splitWordShape" = "chris4";
              "intern" = "true";
            }
          ];
        };
        working-dir = "/tmp/docspell-analysis";
      };
      processing = {
@ -736,6 +751,59 @@ in {
            default = defaults.text-analysis.regex-ner;
            description = "";
          };

          classification = mkOption {
            type = types.submodule({
              options = {
                enabled = mkOption {
                  type = types.bool;
                  default = defaults.text-analysis.classification.enabled;
                  description = ''
                    Whether to enable classification globally. Each collective can
                    decide to disable it. If it is disabled here, no collective
                    can use classification.
                  '';
                };
                item-count = mkOption {
                  type = types.int;
                  default = defaults.text-analysis.classification.item-count;
                  description = ''
                    If concerned with memory consumption, this restricts the
                    number of items to consider. More are better for training. A
                    negative value or zero means to train on all items.
                  '';
                };
                classifiers = mkOption {
                  type = types.listOf types.attrs;
                  default = defaults.text-analysis.classification.classifiers;
                  description = ''
                    These settings are used to configure the classifier. If
                    multiple are given, they are all tried and the "best" is
                    chosen at the end. See
                    https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
                    for more info about these settings. The settings here yielded
                    good results with *my* dataset.
                  '';
                };

              };
            });
            default = defaults.text-analysis.classification;
            description = ''
              Settings for doing document classification.

              This works by learning from existing documents. A collective can
              specify a tag category and the system will try to predict a tag
              from this category for new incoming documents.

              This requires a statistical model that is computed from all
              existing documents. This process is run periodically as
              configured by the collective. It may require a lot of memory,
              depending on the amount of data.

              It utilises this NLP library: https://nlp.stanford.edu/.
            '';
          };
        };
      });
      default = defaults.text-analysis;
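
For example, a NixOS configuration could enable classification globally
and bound its memory use roughly like this. A minimal sketch only: the
"services.docspell-joex" option path is an assumption about how this
module is imported, and the values are purely illustrative.

  # minimal sketch: option path assumed, values illustrative
  services.docspell-joex.text-analysis.classification = {
    # allow collectives to use auto-tagging at all
    enabled = true;
    # train on at most 600 items to bound memory; zero trains on all items
    item-count = 600;
  };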
@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
    , { image = "img/analyze-feature.png"
      , header = "Text Analysis"
      , description = """
The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically.
"""
      }
    , { image = "img/filetype-feature.svg"
@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
"assignment" semantics. Docspell doesn't propose any workflow, but it
can help to implement some.

The tags are *not* taken into account when creating suggestions from
analyzed text yet. However, PDF files may contain metadata itself and
if there is a metadata *keywords* list, these keywords are matched
against the tags in the database. If they match, the item is tagged
automatically.
Docspell can try to predict a tag for new incoming documents
automatically, based on your existing data. This requires training an
algorithm. There are some caveats: the more correctly tagged data you
have, the better the results; it won't work well for, say, the first
100 documents. The tags must also relate to a pattern in the document
text. Tags like *todo* or *waiting* obviously won't work, but typical
"document type" tags, like *invoice* and *receipt*, are a good fit!
That is why you need to provide a tag category, so that only sensible
tags are learned. The algorithm goes through all your items and learns
patterns in the text that relate to the given tags. This training step
can be run periodically, as specified in your collective settings, so
that docspell keeps learning from your already tagged data! More
information about the algorithm can be found in the config, where it
is possible to fine-tune this process, as sketched below.
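
For illustration, fine-tuning happens through the `classifiers`
property sets in the config. A hedged sketch against the NixOS module
shown earlier (the `services.docspell-joex` option path is an
assumption about how that module is imported; the property names are
standard ColumnDataClassifier settings, and the values are purely
illustrative, not recommendations):

    # sketch only: option path assumed, values illustrative
    services.docspell-joex.text-analysis.classification.classifiers = [
      { "useSplitWords" = "true";
        "splitWordsIgnoreRegexp" = ''\s+'';
        "useSplitPrefixSuffixNGrams" = "true";
        # try slightly longer n-grams than the default range of 1..4
        "maxNGramLeng" = "5";
        "minNGramLeng" = "2";
      }
    ];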

Another way to have items tagged automatically is when an input PDF
file contains a list of keywords in its metadata section. These
keywords are then matched against the tags in the database. If they
match, the item is tagged with them.


## Organization and Person