commit 8cb78e3dbe
@@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
 import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.StanfordTextClassifier
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._

 trait TextAnalyser[F[_]] {

   def annotate(
       logger: Logger[F],
-      settings: StanfordSettings,
+      settings: StanfordNerSettings,
       cacheKey: Ident,
       text: String
   ): F[TextAnalyser.Result]

+  def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F]
 }
 object TextAnalyser {

@@ -35,7 +38,7 @@ object TextAnalyser {
     new TextAnalyser[F] {
       def annotate(
          logger: Logger[F],
-          settings: StanfordSettings,
+          settings: StanfordNerSettings,
          cacheKey: Ident,
          text: String
      ): F[TextAnalyser.Result] =
@@ -48,6 +51,11 @@ object TextAnalyser {
          spans = NerLabelSpan.build(list)
        } yield Result(spans ++ list, dates)

+      def classifier(blocker: Blocker)(implicit
+          CS: ContextShift[F]
+      ): TextClassifier[F] =
+        new StanfordTextClassifier[F](cfg.classifier, blocker)
+
      private def textLimit(logger: Logger[F], text: String): F[String] =
        if (text.length <= cfg.maxLength) text.pure[F]
        else
@@ -56,7 +64,7 @@ object TextAnalyser {
            s" Analysing only first ${cfg.maxLength} characters."
          ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+      private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
          : F[Vector[NerLabel]] =
        StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)

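With this change one `TextAnalyser` value serves both NER annotation and text classification. A minimal usage sketch, assuming hypothetical values `analyser: TextAnalyser[IO]`, `logger: Logger[IO]` and a `Blocker` in scope (`Ident.randomId`, `Language.English` and `Result.all` are taken from the surrounding code; the model path is illustrative):

    import cats.effect.{Blocker, ContextShift, IO}
    import docspell.analysis.TextAnalyser
    import docspell.analysis.nlp.{ClassifierModel, StanfordNerSettings}
    import docspell.common._

    def sketch(analyser: TextAnalyser[IO], logger: Logger[IO], blocker: Blocker)(implicit
        CS: ContextShift[IO]
    ): IO[Option[String]] =
      for {
        key <- Ident.randomId[IO]
        result <- analyser.annotate(
          logger,
          StanfordNerSettings(Language.English, false, None),
          key,
          "Invoice 2020-06-01 from ACME"
        )
        _ <- IO(println(result.all)) // NER labels found in the text
        // classify against a previously trained model file
        cat <- analyser
          .classifier(blocker)
          .classify(logger, ClassifierModel(java.nio.file.Paths.get("model.ser.gz")), "some text")
      } yield cat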
@@ -1,5 +1,8 @@
 package docspell.analysis

+import docspell.analysis.nlp.TextClassifierConfig
+
 case class TextAnalysisConfig(
-  maxLength: Int
+  maxLength: Int,
+  classifier: TextClassifierConfig
 )
@@ -0,0 +1,5 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+case class ClassifierModel(model: Path)
@@ -19,7 +19,7 @@ import org.log4s.getLogger
   */
 trait PipelineCache[F[_]] {

-  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+  def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP]

 }

@@ -28,7 +28,7 @@ object PipelineCache {

   def none[F[_]: Applicative]: PipelineCache[F] =
     new PipelineCache[F] {
-      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
        makeClassifier(settings).pure[F]
    }

@@ -38,7 +38,7 @@ object PipelineCache {
   final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
      extends PipelineCache[F] {

-    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+    def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
      for {
        id  <- makeSettingsId(settings)
        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
@@ -48,7 +48,7 @@ object PipelineCache {
        key: String,
        id: String,
        cache: Map[String, Entry],
-        settings: StanfordSettings
+        settings: StanfordNerSettings
    ): (Map[String, Entry], StanfordCoreNLP) =
      cache.get(key) match {
        case Some(entry) =>
@@ -68,7 +68,7 @@ object PipelineCache {
          (cache.updated(key, e), nlp)
      }

-    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+    private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
      val base = settings.copy(regexNer = None).toString
      val size: F[Long] =
        settings.regexNer match {
@@ -81,7 +81,7 @@ object PipelineCache {
        }

  }
-  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+  private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
    new StanfordCoreNLP(Properties.forSettings(settings))
  }
@@ -7,6 +7,9 @@ import docspell.common._

 object Properties {

+  def fromMap(m: Map[String, String]): JProps =
+    apply(m.toSeq: _*)
+
   def apply(ps: (String, String)*): JProps = {
     val p = new JProps()
     for ((k, v) <- ps)
@@ -14,7 +17,7 @@ object Properties {
     p
   }

-  def forSettings(settings: StanfordSettings): JProps = {
+  def forSettings(settings: StanfordNerSettings): JProps = {
     val regexNerFile = settings.regexNer
       .map(p => p.normalize().toAbsolutePath().toString())
     settings.lang match {
@@ -25,7 +25,7 @@ object StanfordNerClassifier {
   def nerAnnotate[F[_]: Applicative](
       cacheKey: String,
       cache: PipelineCache[F]
-  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
       .map(crf => runClassifier(crf, text))
@@ -19,4 +19,8 @@ import docspell.common._
   * as a last step to tag untagged tokens using the provided list of
   * regexps.
   */
-case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
+case class StanfordNerSettings(
+    lang: Language,
+    highRecall: Boolean,
+    regexNer: Option[Path]
+)
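Constructing the renamed settings type stays a one-liner; a hedged example (values illustrative):

    import docspell.analysis.nlp.StanfordNerSettings
    import docspell.common.Language

    // German NER without high recall and without an extra regex-NER file
    val nerSettings = StanfordNerSettings(Language.German, highRecall = false, regexNer = None)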
@@ -0,0 +1,153 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier._
+import docspell.common._
+
+import edu.stanford.nlp.classify.ColumnDataClassifier
+
+final class StanfordTextClassifier[F[_]: Sync: ContextShift](
+    cfg: TextClassifierConfig,
+    blocker: Blocker
+) extends TextClassifier[F] {
+
+  def trainClassifier[A](
+      logger: Logger[F],
+      data: Stream[F, Data]
+  )(handler: TextClassifier.Handler[F, A]): F[A] =
+    File
+      .withTempDir(cfg.workingDir, "trainclassifier")
+      .use { dir =>
+        for {
+          rawData   <- writeDataFile(blocker, dir, data)
+          _         <- logger.info(s"Learning from ${rawData.count} items.")
+          trainData <- splitData(logger, rawData)
+          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
+          sorted = scores.sortBy(-_.score)
+          res <- handler(sorted.head.model)
+        } yield res
+      }
+
+  def classify(
+      logger: Logger[F],
+      model: ClassifierModel,
+      text: String
+  ): F[Option[String]] =
+    Sync[F].delay {
+      val cls = ColumnDataClassifier.getClassifier(
+        model.model.normalize().toAbsolutePath().toString()
+      )
+      val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
+      Option(cat)
+    }
+
+  // --- helpers
+
+  def train(
+      logger: Logger[F],
+      in: TrainData,
+      props: Map[String, String]
+  ): F[TrainResult] =
+    for {
+      _ <- logger.debug(s"Training classifier from $props")
+      res <- Sync[F].delay {
+        val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
+        cdc.trainClassifier(in.train.toString())
+        val score = cdc.testClassifier(in.test.toString())
+        TrainResult(score.first(), ClassifierModel(in.modelFile))
+      }
+      _ <- logger.debug(s"Trained with result $res")
+    } yield res
+
+  def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
+    val nTest = (in.count * 0.15).toLong
+
+    val td =
+      TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
+
+    val fileLines =
+      fs2.io.file
+        .readAll(in.file, blocker, 4096)
+        .through(fs2.text.utf8Decode)
+        .through(fs2.text.lines)
+
+    for {
+      _ <- logger.debug(
+        s"Splitting raw data into test/train data. Testing with $nTest entries"
+      )
+      _ <-
+        fileLines
+          .take(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.test, blocker))
+          .compile
+          .drain
+      _ <-
+        fileLines
+          .drop(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.train, blocker))
+          .compile
+          .drain
+    } yield td
+  }
+
+  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
+    val target = dir.resolve("rawdata")
+    for {
+      counter <- Ref.of[F, Long](0L)
+      _ <-
+        data
+          .filter(_.text.nonEmpty)
+          .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}")
+          .evalTap(_ => counter.update(_ + 1))
+          .intersperse("\r\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(target, blocker))
+          .compile
+          .drain
+      lines <- counter.get
+    } yield RawData(lines, target)
+  }
+
+  def normalisedText(text: String): String =
+    text.replaceAll("[\n\r\t]+", " ")
+
+  def fixRef(str: String): String =
+    str.replace('\t', '_')
+
+  def amendProps(
+      trainData: TrainData,
+      props: Map[String, String]
+  ): Map[String, String] =
+    prepend("2.", props) ++ Map(
+      "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
+      "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
+      "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
+    ).toList
+
+  case class RawData(count: Long, file: Path)
+  case class TrainData(train: Path, test: Path) {
+    val modelFile = train.resolveSibling("model.ser.gz")
+  }
+
+  case class TrainResult(score: Double, model: ClassifierModel)
+
+  def prepend(pre: String, data: Map[String, String]): Map[String, String] =
+    data.toList
+      .map({
+        case (k, v) =>
+          if (k.startsWith(pre)) (k, v)
+          else (pre + k, v)
+      })
+      .toMap
+}
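For orientation: `writeDataFile` above emits one item per line in the three-column, tab-separated layout that `ColumnDataClassifier` expects (class, reference, text), and `amendProps` prefixes feature keys with `2.` so they apply to the third, text-carrying column. A small sketch mirroring that line format (example values are illustrative):

    import docspell.analysis.nlp.TextClassifier.Data

    // same shape as the lines produced by writeDataFile
    def toLine(d: Data): String =
      s"${d.cls}\t${d.ref.replace('\t', '_')}\t${d.text.replaceAll("[\n\r\t]+", " ")}"

    // toLine(Data("invoice", "item-1", "this is your invoice total $421"))
    //   == "invoice\titem-1\tthis is your invoice total $421"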
@@ -0,0 +1,25 @@
+package docspell.analysis.nlp
+
+import cats.data.Kleisli
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.common._
+
+trait TextClassifier[F[_]] {
+
+  def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])(
+      handler: TextClassifier.Handler[F, A]
+  ): F[A]
+
+  def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]]
+
+}
+
+object TextClassifier {
+
+  type Handler[F[_], A] = Kleisli[F, ClassifierModel, A]
+
+  case class Data(cls: String, ref: String, text: String)
+
+}
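The `Handler` alias makes the caller responsible for persisting the trained model: the `Kleisli` runs while the temporary training directory still exists, and the model file is gone once `trainClassifier` returns. A minimal sketch of such a handler, used as `classifier.trainClassifier(logger, data)(persist)` (the target path and copy strategy are assumptions, not from this commit):

    import java.nio.file.{Files, Paths, StandardCopyOption}

    import cats.data.Kleisli
    import cats.effect.IO
    import cats.implicits._

    import docspell.analysis.nlp.TextClassifier

    // copy the model somewhere durable before the temp dir is removed
    val persist: TextClassifier.Handler[IO, Unit] =
      Kleisli { model =>
        IO(
          Files.copy(
            model.model,
            Paths.get("saved-model.ser.gz"),
            StandardCopyOption.REPLACE_EXISTING
          )
        ).void
      }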
@@ -0,0 +1,10 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.data.NonEmptyList
+
+case class TextClassifierConfig(
+    workingDir: Path,
+    classifierConfigs: NonEmptyList[Map[String, String]]
+)
BIN  modules/analysis/src/test/resources/test.ser.gz (new binary file, not shown)
@@ -0,0 +1,76 @@
+package docspell.analysis.nlp
+
+import minitest._
+import cats.effect._
+import scala.concurrent.ExecutionContext
+import java.nio.file.Paths
+import cats.data.NonEmptyList
+import docspell.common._
+import fs2.Stream
+import cats.data.Kleisli
+import TextClassifier.Data
+
+object StanfordTextClassifierSuite extends SimpleTestSuite {
+  val logger = Logger.log4s[IO](org.log4s.getLogger)
+
+  implicit val CS = IO.contextShift(ExecutionContext.global)
+
+  test("learn from data") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+
+    val data =
+      Stream
+        .emit(Data("invoice", "n", "this is your invoice total $421"))
+        .repeat
+        .take(10)
+        .zip(
+          Stream
+            .emit(Data("receipt", "n", "shopping receipt cheese cake bar"))
+            .repeat
+            .take(10)
+        )
+        .flatMap({
+          case (a, b) =>
+            Stream.emits(Seq(a, b))
+        })
+        .covary[IO]
+
+    val modelExists =
+      Blocker[IO].use { blocker =>
+        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+        classifier.trainClassifier[Boolean](logger, data)(
+          Kleisli(result => File.existsNonEmpty[IO](result.model))
+        )
+      }
+    assertEquals(modelExists.unsafeRunSync(), true)
+  }
+
+  test("run classifier") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+    val things = for {
+      dir     <- File.withTempDir[IO](Paths.get("target"), "testcls")
+      blocker <- Blocker[IO]
+    } yield (dir, blocker)
+
+    things
+      .use {
+        case (dir, blocker) =>
+          val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+
+          val modelFile = dir.resolve("test.ser.gz")
+          for {
+            _ <-
+              LenientUri
+                .fromJava(getClass.getResource("/test.ser.gz"))
+                .readURL[IO](4096, blocker)
+                .through(fs2.io.file.writeAll(modelFile, blocker))
+                .compile
+                .drain
+            model = ClassifierModel(modelFile)
+            cat <- classifier.classify(logger, model, "there is receipt always")
+            _ = assertEquals(cat, Some("receipt"))
+          } yield ()
+      }
+      .unsafeRunSync()
+  }
+}
@@ -52,12 +52,12 @@ object BackendApp {
       queue      <- JobQueue(store)
       loginImpl  <- Login[F](store)
       signupImpl <- OSignup[F](store)
-      collImpl   <- OCollective[F](store)
+      joexImpl   <- OJoex(JoexClient(httpClient), store)
+      collImpl   <- OCollective[F](store, utStore, queue, joexImpl)
       sourceImpl <- OSource[F](store)
       tagImpl    <- OTag[F](store)
       equipImpl  <- OEquipment[F](store)
       orgImpl    <- OOrganization(store)
-      joexImpl   <- OJoex(JoexClient(httpClient), store)
       uploadImpl <- OUpload(store, queue, cfg.files, joexImpl)
       nodeImpl   <- ONode(store)
       jobImpl    <- OJob(store, joexImpl)
@@ -8,14 +8,21 @@ import docspell.backend.PasswordCrypt
 import docspell.backend.ops.OCollective._
 import docspell.common._
 import docspell.store.queries.QCollective
+import docspell.store.queue.JobQueue
 import docspell.store.records._
+import docspell.store.usertask.UserTask
+import docspell.store.usertask.UserTaskStore
 import docspell.store.{AddResult, Store}

+import com.github.eikek.calev._
+
 trait OCollective[F[_]] {

   def find(name: Ident): F[Option[RCollective]]

-  def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
+  def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]

+  def findSettings(collective: Ident): F[Option[OCollective.Settings]]
+
   def listUser(collective: Ident): F[Vector[RUser]]

@@ -43,6 +50,7 @@ trait OCollective[F[_]] {

   def findEnabledSource(sourceId: Ident): F[Option[RSource]]

+  def startLearnClassifier(collective: Ident): F[Unit]
 }

 object OCollective {
@@ -55,6 +63,8 @@ object OCollective {

   type Settings = RCollective.Settings
   val Settings = RCollective.Settings
+  type Classifier = RClassifierSetting.Classifier
+  val Classifier = RClassifierSetting.Classifier

   sealed trait PassChangeResult
   object PassChangeResult {
@@ -91,7 +101,12 @@ object OCollective {
     }
   }

-  def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] =
+  def apply[F[_]: Effect](
+      store: Store[F],
+      uts: UserTaskStore[F],
+      queue: JobQueue[F],
+      joex: OJoex[F]
+  ): Resource[F, OCollective[F]] =
     Resource.pure[F, OCollective[F]](new OCollective[F] {
       def find(name: Ident): F[Option[RCollective]] =
        store.transact(RCollective.findById(name))
@@ -101,6 +116,41 @@ object OCollective {
          .transact(RCollective.updateSettings(collective, sett))
          .attempt
          .map(AddResult.fromUpdate)
+          .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F])
+
+      def updateLearnClassifierTask(coll: Ident, sett: Settings) =
+        for {
+          id <- Ident.randomId[F]
+          on    = sett.classifier.map(_.enabled).getOrElse(false)
+          timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe(""))
+          ut = UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            on,
+            timer,
+            LearnClassifierArgs(coll)
+          )
+          _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut)
+          _ <- joex.notifyAllNodes
+        } yield ()
+
+      def startLearnClassifier(collective: Ident): F[Unit] =
+        for {
+          id <- Ident.randomId[F]
+          ut <- UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            true,
+            CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All),
+            LearnClassifierArgs(collective)
+          ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName))
+          job <- ut.toJob
+          _   <- queue.insert(job)
+          _   <- joex.notifyAllNodes
+        } yield ()
+
+      def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
+        store.transact(RCollective.getSettings(collective))
+
       def listUser(collective: Ident): F[Vector[RUser]] =
        store.transact(RUser.findAll(collective, _.login))
@@ -0,0 +1,35 @@
+package docspell.common
+
+import docspell.common.syntax.all._
+
+import io.circe._
+import io.circe.generic.semiauto._
+
+/** Arguments to the classify-item task.
+  *
+  * This task is run periodically and learns from existing documents
+  * to create a model for predicting tags of new documents. The user
+  * must give a tag category as a subset of possible tags.
+  */
+case class LearnClassifierArgs(
+    collective: Ident
+) {
+
+  def makeSubject: String =
+    "Learn tags"
+
+}
+
+object LearnClassifierArgs {
+
+  val taskName = Ident.unsafe("learn-classifier")
+
+  implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
+    deriveEncoder[LearnClassifierArgs]
+  implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
+    deriveDecoder[LearnClassifierArgs]
+
+  def parse(str: String): Either[Throwable, LearnClassifierArgs] =
+    str.parseJsonAs[LearnClassifierArgs]
+
+}
@@ -271,6 +271,50 @@ docspell.joex {
     # file will be kept until a check for a state change is done.
     file-cache-time = "1 minute"
   }
+
+  # Settings for doing document classification.
+  #
+  # This works by learning from existing documents. A collective can
+  # specify a tag category and the system will try to predict a tag
+  # from this category for new incoming documents.
+  #
+  # This requires a statistical model that is computed from all
+  # existing documents. This process is run periodically as
+  # configured by the collective. It may require a lot of memory,
+  # depending on the amount of data.
+  #
+  # It utilises this NLP library: https://nlp.stanford.edu/.
+  classification {
+    # Whether to enable classification globally. Each collective can
+    # decide to disable it. If it is disabled here, no collective
+    # can use classification.
+    enabled = true
+
+    # If concerned with memory consumption, this restricts the
+    # number of items to consider. More are better for training. A
+    # negative value or zero means to train on all items.
+    item-count = 0
+
+    # These settings are used to configure the classifier. If
+    # multiple are given, they are all tried and the "best" is
+    # chosen at the end. See
+    # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+    # for more info about these settings. The settings here yielded
+    # good results with *my* dataset.
+    #
+    # Enclose regexps in triple quotes.
+    classifiers = [
+      { "useSplitWords" = "true"
+        "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
+        "splitWordsIgnoreRegexp" = """\s+"""
+        "useSplitPrefixSuffixNGrams" = "true"
+        "maxNGramLeng" = "4"
+        "minNGramLeng" = "1"
+        "splitWordShape" = "chris4"
+        "intern" = "true" # makes it slower but saves memory
+      }
+    ]
+  }
 }

 # Configuration for converting files into PDFs.
@@ -2,7 +2,10 @@ package docspell.joex

 import java.nio.file.Path

+import cats.data.NonEmptyList
+
 import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.nlp.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -57,15 +60,30 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      regexNer: RegexNer
+      regexNer: RegexNer,
+      classification: Classification
   ) {

     def textAnalysisConfig: TextAnalysisConfig =
-      TextAnalysisConfig(maxLength)
+      TextAnalysisConfig(
+        maxLength,
+        TextClassifierConfig(
+          workingDir,
+          NonEmptyList
+            .fromList(classification.classifiers)
+            .getOrElse(NonEmptyList.of(Map.empty))
+        )
+      )

     def regexNerFileConfig: RegexNerFile.Config =
       RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
   }

   case class RegexNer(enabled: Boolean, fileCacheTime: Duration)

+  case class Classification(
+      enabled: Boolean,
+      itemCount: Int,
+      classifiers: List[Map[String, String]]
+  )
 }
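Note the fallback when mapping the config: an empty `classifiers` list still yields a usable `TextClassifierConfig`, because a single empty property map is substituted. A quick illustration:

    import cats.data.NonEmptyList

    val fromConfig: List[Map[String, String]] = Nil // nothing configured
    val candidates =
      NonEmptyList.fromList(fromConfig).getOrElse(NonEmptyList.of(Map.empty[String, String]))
    // candidates == NonEmptyList.of(Map()): training still runs, with default
    // ColumnDataClassifier settings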
@@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.notify._
 import docspell.joex.pdfconv.ConvertAllPdfTask
 import docspell.joex.pdfconv.PdfConvTask
@@ -159,6 +160,13 @@ object JoexAppImpl {
             ConvertAllPdfTask.onCancel[F]
           )
         )
+        .withTask(
+          JobTask.json(
+            LearnClassifierArgs.taskName,
+            LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
+            LearnClassifierTask.onCancel[F]
+          )
+        )
         .resource
       psch <- PeriodicScheduler.create(
         cfg.periodicScheduler,
@@ -0,0 +1,111 @@
+package docspell.joex.learn
+
+import cats.data.Kleisli
+import cats.data.OptionT
+import cats.effect._
+import cats.implicits._
+import fs2.{Pipe, Stream}
+
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.backend.ops.OCollective
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler._
+import docspell.store.queries.QItem
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.MimetypeHint
+
+object LearnClassifierTask {
+  val noClass = "__NONE__"
+  val pageSep = " --n-- "
+
+  type Args = LearnClassifierArgs
+
+  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+    Task.log(_.warn("Cancelling learn-classifier task"))
+
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
+      blocker: Blocker,
+      analyser: TextAnalyser[F]
+  ): Task[F, Args, Unit] =
+    Task { ctx =>
+      (for {
+        sett <- findActiveSettings[F](ctx, cfg)
+        data = selectItems(
+          ctx,
+          math.min(cfg.classification.itemCount, sett.itemCount).toLong,
+          sett.category.getOrElse("")
+        )
+        _ <- OptionT.liftF(
+          analyser
+            .classifier(blocker)
+            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
+        )
+      } yield ())
+        .getOrElseF(logInactiveWarning(ctx.logger))
+    }
+
+  private def handleModel[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      blocker: Blocker
+  )(trainedModel: ClassifierModel): F[Unit] =
+    for {
+      oldFile <- ctx.store.transact(
+        RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
+      )
+      _ <- ctx.logger.info("Storing new trained model")
+      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
+      newFile <-
+        ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
+      _ <- ctx.store.transact(
+        RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
+      )
+      _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
+      _ <- oldFile match {
+        case Some(fid) =>
+          ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
+            ctx.store.bitpeace.delete(fid.id).compile.drain
+        case None => ().pure[F]
+      }
+    } yield ()
+
+  private def selectItems[F[_]](
+      ctx: Context[F, Args],
+      max: Long,
+      category: String
+  ): Stream[F, Data] = {
+    val connStream =
+      for {
+        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
+        tt <- Stream.eval(
+          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
+        )
+      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
+    ctx.store.transact(connStream.filter(_.text.nonEmpty))
+  }
+
+  private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
+    if (max <= 0) identity
+    else _.take(max)
+
+  private def findActiveSettings[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, OCollective.Classifier] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
+        .filter(_.enabled)
+        .filter(_.category.nonEmpty)
+        .map(OCollective.Classifier.fromRecord)
+    else
+      OptionT.none
+
+  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
+    logger.warn(
+      "Classification is disabled. Check joex config and the collective settings."
+    )
+}
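Worth noting how `selectItems` combines the global and per-collective limits; a sketch with illustrative numbers:

    // joex config: classification.item-count = 500; collective setting: itemCount = 200
    val effective = math.min(500, 200).toLong // 200
    // if either value is zero or negative, the minimum is also <= 0 and
    // restrictTo becomes a no-op, i.e. training uses all items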
@@ -38,6 +38,9 @@ case class ItemData(
     copy(metas = next)
   }

+  def appendTags(tags: Seq[String]): ItemData =
+    copy(tags = (this.tags ++ tags.toList).distinct)
+
   def changeMeta(
       attachId: Ident,
       f: RAttachmentMeta => RAttachmentMeta
@@ -34,12 +34,12 @@ object ProcessItem {
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

-  def analysisOnly[F[_]: Sync](
+  def analysisOnly[F[_]: Sync: ContextShift](
       cfg: Config,
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser, regexNer)(item)
+    TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -1,23 +1,33 @@
 package docspell.joex.process

+import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.RangeDef

 object TextAnalysis {
+  type Args = ProcessItemArgs

-  def apply[F[_]: Sync](
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
-  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  )(item: ItemData): Task[F, Args, ItemData] =
     Task { ctx =>
       for {
         _ <- ctx.logger.info("Starting text analysis")
@@ -34,15 +44,18 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
+      } yield item
+        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .appendTags(tag.toSeq)
     }

   def annotateAttachment[F[_]: Sync](
-      ctx: Context[F, ProcessItemArgs],
+      ctx: Context[F, Args],
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(ctx.args.meta.language, false, None)
+    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
     for {
       customNer <- nerFile.makeFile(ctx.args.meta.collective)
       sett = settings.copy(regexNer = customNer)
@@ -54,4 +67,42 @@ object TextAnalysis {
       )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  def predictTag[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): OptionT[F, String] =
+    for {
+      model <- findActiveModel(ctx, cfg)
+      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
+      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
+      modelData =
+        ctx.store.bitpeace
+          .get(model.id)
+          .unNoneTerminate
+          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
+        val modelFile = dir.resolve("model.ser.gz")
+        modelData
+          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
+          .compile
+          .drain
+          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
+      }).filter(_ != LearnClassifierTask.noClass)
+      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
+    } yield cls
+
+  private def findActiveModel[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, Ident] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+        .filter(_.enabled)
+        .mapFilter(_.fileId)
+    else
+      OptionT.none
+
 }
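Because `predictTag` is an `OptionT`, every missing precondition (classification disabled, no stored model, or a `__NONE__` prediction) collapses to `None`, and `appendTags(tag.toSeq)` then appends nothing. A tiny illustration of that last step:

    val tag: Option[String] = None
    tag.toSeq // == Seq(): appendTags leaves item.tags unchanged in this case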
@@ -1047,6 +1047,28 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ContactList"
+  /sec/collective/classifier/startonce:
+    post:
+      tags: [ Collective ]
+      summary: Starts the learn-classifier task
+      description: |
+        If the collective has classification enabled, this will submit
+        the task for learning a classifier from existing data. This
+        task is usually run periodically as determined by the
+        collective settings.
+
+        The request is empty, settings are used from the collective.
+      security:
+        - authTokenHeader: []
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"
+
   /sec/user:
     get:
       tags: [ Collective ]
@@ -3643,12 +3665,14 @@ components:
         description: DateTime
         type: integer
        format: date-time

    CollectiveSettings:
      description: |
        Settings for a collective.
      required:
        - language
        - integrationEnabled
+        - classifier
      properties:
        language:
          type: string
@@ -3658,6 +3682,31 @@ components:
          description: |
            Whether the collective has the integration endpoint
            enabled.
+        classifier:
+          $ref: "#/components/schemas/ClassifierSetting"
+
+    ClassifierSetting:
+      description: |
+        Settings for learning a document classifier.
+      required:
+        - enabled
+        - schedule
+        - itemCount
+      properties:
+        enabled:
+          type: boolean
+        category:
+          type: string
+        itemCount:
+          type: integer
+          format: int32
+          description: |
+            The max. number of items to learn from. The newest items
+            are considered.
+        schedule:
+          type: string
+          format: calevent
+
    SourceList:
      description: |
        A list of sources.
@@ -10,6 +10,7 @@ import docspell.restapi.model._
 import docspell.restserver.conv.Conversions
 import docspell.restserver.http4s._

+import com.github.eikek.calev.CalEvent
 import org.http4s.HttpRoutes
 import org.http4s.circe.CirceEntityDecoder._
 import org.http4s.circe.CirceEntityEncoder._
@@ -37,7 +38,18 @@ object CollectiveRoutes {
       case req @ POST -> Root / "settings" =>
         for {
           settings <- req.as[CollectiveSettings]
-          sett = OCollective.Settings(settings.language, settings.integrationEnabled)
+          sett = OCollective.Settings(
+            settings.language,
+            settings.integrationEnabled,
+            Some(
+              OCollective.Classifier(
+                settings.classifier.enabled,
+                settings.classifier.schedule,
+                settings.classifier.itemCount,
+                settings.classifier.category
+              )
+            )
+          )
           res <-
             backend.collective
               .updateSettings(user.account.collective, sett)
@@ -46,8 +58,21 @@ object CollectiveRoutes {

       case GET -> Root / "settings" =>
         for {
-          collDb <- backend.collective.find(user.account.collective)
-          sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
+          settDb <- backend.collective.findSettings(user.account.collective)
+          sett = settDb.map(c =>
+            CollectiveSettings(
+              c.language,
+              c.integrationEnabled,
+              ClassifierSetting(
+                c.classifier.map(_.enabled).getOrElse(false),
+                c.classifier.flatMap(_.category),
+                c.classifier.map(_.itemCount).getOrElse(0),
+                c.classifier
+                  .map(_.schedule)
+                  .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
+              )
+            )
+          )
           resp <- sett.toResponse()
         } yield resp

@@ -63,6 +88,12 @@ object CollectiveRoutes {
           resp <- Ok(ContactList(res.map(Conversions.mkContact)))
         } yield resp

+      case POST -> Root / "classifier" / "startonce" =>
+        for {
+          _    <- backend.collective.startLearnClassifier(user.account.collective)
+          resp <- Ok(BasicResult(true, "Task submitted"))
+        } yield resp
+
       case GET -> Root =>
         for {
           collDb <- backend.collective.find(user.account.collective)
@@ -0,0 +1,9 @@
+CREATE TABLE `classifier_setting` (
+  `cid` varchar(254) not null primary key,
+  `enabled` boolean not null,
+  `schedule` varchar(254) not null,
+  `category` varchar(254) not null,
+  `file_id` varchar(254),
+  `created` timestamp not null,
+  foreign key (`cid`) references `collective`(`cid`)
+);
@@ -0,0 +1,11 @@
+CREATE TABLE "classifier_setting" (
+  "cid" varchar(254) not null primary key,
+  "enabled" boolean not null,
+  "schedule" varchar(254) not null,
+  "category" varchar(254) not null,
+  "item_count" int not null,
+  "file_id" varchar(254),
+  "created" timestamp not null,
+  foreign key ("cid") references "collective"("cid"),
+  foreign key ("file_id") references "filemeta"("id")
+);
@@ -67,8 +67,8 @@ trait DoobieSyntax {
     Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
-    Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
-      Fragment.const(") FROM ") ++ table ++ this.where(where)
+    Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
+      Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
     Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this
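The reasoning behind this two-character change: wrapping the column list in parentheses made `DISTINCT(...)` look like a function applied to a single expression, which is invalid or surprising once several columns are selected, as `resolveTextAndTag` below now does. A sketch of the generated SQL before and after (column names illustrative):

    // before: SELECT DISTINCT(m.content,t.tid,t.name) FROM ...
    //   parenthesized form; not valid standard SQL for multiple columns
    // after:  SELECT DISTINCT m.content,t.tid,t.name FROM ...
    //   standard SQL; DISTINCT applies to the whole selected row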
@@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
 import docspell.common.{IdRef, _}
 import docspell.store.Store
 import docspell.store.impl.Implicits._
@@ -615,4 +616,75 @@ object QItem {
       .query[NameAndNotes]
       .streamWithChunkSize(chunkSize)
   }

+  def findAllNewesFirst(
+      collective: Ident,
+      chunkSize: Int
+  ): Stream[ConnectionIO, Ident] = {
+    val cols = Seq(RItem.Columns.id)
+    (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
+      orderBy(RItem.Columns.created.desc))
+      .query[Ident]
+      .streamWithChunkSize(chunkSize)
+  }
+
+  case class TagName(id: Ident, name: String)
+  case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
+
+  def resolveTextAndTag(
+      collective: Ident,
+      itemId: Ident,
+      tagCategory: String,
+      pageSep: String
+  ): ConnectionIO[TextAndTag] = {
+    val aId    = RAttachment.Columns.id.prefix("a")
+    val aItem  = RAttachment.Columns.itemId.prefix("a")
+    val mId    = RAttachmentMeta.Columns.id.prefix("m")
+    val mText  = RAttachmentMeta.Columns.content.prefix("m")
+    val tiItem = RTagItem.Columns.itemId.prefix("ti")
+    val tiTag  = RTagItem.Columns.tagId.prefix("ti")
+    val tId    = RTag.Columns.tid.prefix("t")
+    val tName  = RTag.Columns.name.prefix("t")
+    val tCat   = RTag.Columns.category.prefix("t")
+    val iId    = RItem.Columns.id.prefix("i")
+    val iColl  = RItem.Columns.cid.prefix("i")
+
+    val cte = withCTE(
+      "tags" -> selectSimple(
+        Seq(tiItem, tId, tName),
+        RTagItem.table ++ fr"ti INNER JOIN" ++
+          RTag.table ++ fr"t ON" ++ tId.is(tiTag),
+        and(tiItem.is(itemId), tCat.is(tagCategory))
+      )
+    )
+
+    val cols = Seq(mText, tId, tName)
+
+    val from = RItem.table ++ fr"i INNER JOIN" ++
+      RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
+      RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
+      fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)
+
+    val where =
+      and(
+        iId.is(itemId),
+        iColl.is(collective),
+        mText.isNotNull,
+        mText.isNot("")
+      )
+
+    val q = cte ++ selectDistinct(cols, from, where)
+    for {
+      _ <- logger.ftrace[ConnectionIO](
+        s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
+      )
+      texts <- q.query[(String, Option[TagName])].to[List]
+      _ <- logger.ftrace[ConnectionIO](
+        s"Got ${texts.size} text and tag entries for item ${itemId.id}"
+      )
+      tag = texts.headOption.flatMap(_._2)
+      txt = texts.map(_._1).mkString(pageSep)
+    } yield TextAndTag(itemId, txt, tag)
+  }
+
 }
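These two queries are meant to be composed, as `LearnClassifierTask.selectItems` does; a hedged sketch of that wiring (category and separator values illustrative):

    import doobie.ConnectionIO
    import fs2.Stream

    import docspell.common.Ident

    def trainingRows(collective: Ident): Stream[ConnectionIO, QItem.TextAndTag] =
      QItem
        .findAllNewesFirst(collective, 10)
        .evalMap(id => QItem.resolveTextAndTag(collective, id, "doctype", " --n-- "))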
@@ -0,0 +1,113 @@
+package docspell.store.records
+
+import cats.implicits._
+
+import docspell.common._
+import docspell.store.impl.Implicits._
+import docspell.store.impl._
+
+import com.github.eikek.calev._
+import doobie._
+import doobie.implicits._
+
+case class RClassifierSetting(
+    cid: Ident,
+    enabled: Boolean,
+    schedule: CalEvent,
+    category: String,
+    itemCount: Int,
+    fileId: Option[Ident],
+    created: Timestamp
+) {}
+
+object RClassifierSetting {
+
+  val table = fr"classifier_setting"
+
+  object Columns {
+    val cid       = Column("cid")
+    val enabled   = Column("enabled")
+    val schedule  = Column("schedule")
+    val category  = Column("category")
+    val itemCount = Column("item_count")
+    val fileId    = Column("file_id")
+    val created   = Column("created")
+    val all       = List(cid, enabled, schedule, category, itemCount, fileId, created)
+  }
+  import Columns._
+
+  def insert(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql =
+      insertRow(
+        table,
+        all,
+        fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
+      )
+    sql.update.run
+  }
+
+  def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql = updateRow(
+      table,
+      cid.is(v.cid),
+      commas(
+        enabled.setTo(v.enabled),
+        schedule.setTo(v.schedule),
+        category.setTo(v.category),
+        itemCount.setTo(v.itemCount),
+        fileId.setTo(v.fileId)
+      )
+    )
+    sql.update.run
+  }
+
+  def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
+    updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run
+
+  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
+    for {
+      n1 <- updateRow(
+        table,
+        cid.is(v.cid),
+        commas(
+          enabled.setTo(v.enabled),
+          schedule.setTo(v.schedule),
+          itemCount.setTo(v.itemCount),
+          category.setTo(v.category)
+        )
+      ).update.run
+      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
+    } yield n1 + n2
+
+  def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
+    val sql = selectSimple(all, table, cid.is(id))
+    sql.query[RClassifierSetting].option
+  }
+
+  def delete(coll: Ident): ConnectionIO[Int] =
+    deleteFrom(table, cid.is(coll)).update.run
+
+  case class Classifier(
+      enabled: Boolean,
+      schedule: CalEvent,
+      itemCount: Int,
+      category: Option[String]
+  ) {
+
+    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
+      RClassifierSetting(
+        coll,
+        enabled,
+        schedule,
+        category.getOrElse(""),
+        itemCount,
+        None,
+        created
+      )
+  }
+  object Classifier {
+    def fromRecord(r: RClassifierSetting): Classifier =
+      Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
+  }
+
+}
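`updateSettings` is a plain update-then-insert upsert; run inside a single transaction, as `RCollective.updateSettings` below does, this is adequate for a one-row-per-collective table. The effective statement sequence, sketched as comments:

    // UPDATE classifier_setting SET enabled=?, schedule=?, item_count=?, category=? WHERE cid = ?
    // if 0 rows were updated:
    //   INSERT INTO classifier_setting (cid, enabled, schedule, category, item_count, file_id, created)
    //   VALUES (...)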
@ -61,14 +61,47 @@ object RCollective {
     updateRow(table, id.is(cid), language.setTo(lang)).update.run

   def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
-    updateRow(
-      table,
-      id.is(cid),
-      commas(
-        language.setTo(settings.language),
-        integration.setTo(settings.integrationEnabled)
-      )
-    ).update.run
+    for {
+      n1 <- updateRow(
+        table,
+        id.is(cid),
+        commas(
+          language.setTo(settings.language),
+          integration.setTo(settings.integrationEnabled)
+        )
+      ).update.run
+      cls <-
+        Timestamp
+          .current[ConnectionIO]
+          .map(now => settings.classifier.map(_.toRecord(cid, now)))
+      n2 <- cls match {
+        case Some(cr) =>
+          RClassifierSetting.updateSettings(cr)
+        case None =>
+          RClassifierSetting.delete(cid)
+      }
+    } yield n1 + n2
+
+  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+    val cId   = id.prefix("c")
+    val CS    = RClassifierSetting.Columns
+    val csCid = CS.cid.prefix("cs")
+
+    val cols = Seq(
+      language.prefix("c"),
+      integration.prefix("c"),
+      CS.enabled.prefix("cs"),
+      CS.schedule.prefix("cs"),
+      CS.itemCount.prefix("cs"),
+      CS.category.prefix("cs")
+    )
+    val from = table ++ fr"c LEFT JOIN" ++
+      RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)
+
+    selectSimple(cols, from, cId.is(coll))
+      .query[Settings]
+      .option
+  }

   def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
     val sql = selectSimple(all, table, id.is(cid))
@ -112,5 +145,10 @@ object RCollective {
     selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
   }

-  case class Settings(language: Language, integrationEnabled: Boolean)
+  case class Settings(
+    language: Language,
+    integrationEnabled: Boolean,
+    classifier: Option[RClassifierSetting.Classifier]
+  )

 }
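To illustrate the extended Settings record, a hypothetical construction; the schedule string, item count, and category are made-up example values, and `CalEvent.unsafe` is assumed from the calev library docspell uses:

import com.github.eikek.calev.CalEvent

// Example values only: train on the latest 600 items, predicting tags
// from a (hypothetical) "doctype" category during the first week of a month.
val classifier = RClassifierSetting.Classifier(
  enabled = true,
  schedule = CalEvent.unsafe("*-*-01..07 04:00:00"),
  itemCount = 600,
  category = Some("doctype")
)

val settings = RCollective.Settings(
  language = Language.German,
  integrationEnabled = false,
  classifier = Some(classifier)
)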
@ -88,6 +88,7 @@ module Api exposing
     , setItemNotes
     , setTags
     , setUnconfirmed
+    , startClassifier
     , startOnceNotifyDueItems
     , startOnceScanMailbox
     , startReIndex
@ -795,6 +796,19 @@ versionInfo flags receive =
 --- Collective


+startClassifier :
+    Flags
+    -> (Result Http.Error BasicResult -> msg)
+    -> Cmd msg
+startClassifier flags receive =
+    Http2.authPost
+        { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce"
+        , account = getAccount flags
+        , body = Http.emptyBody
+        , expect = Http.expectJson receive Api.Model.BasicResult.decoder
+        }
+
+
 getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg
 getTagCloud flags receive =
     Http2.authGet
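The new call posts an empty body and expects a BasicResult. A hedged sketch of what the server side of this endpoint could look like in http4s (which docspell's restserver builds on); the route wiring and task submission are assumptions, not taken from this commit:

import cats.effect.IO
import org.http4s.HttpRoutes
import org.http4s.dsl.io._

// Hypothetical handler: submit the learn-classifier task, then confirm.
// The auth middleware is assumed to have stripped the /api/v1/sec prefix.
def classifierRoutes(submitLearnTask: IO[Unit]): HttpRoutes[IO] =
  HttpRoutes.of[IO] {
    case POST -> Root / "classifier" / "startonce" =>
      submitLearnTask *> Ok("""{"success":true,"message":"Task submitted"}""")
  }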
@ -218,12 +218,12 @@ loginInfo model =
             , menuEntry model
                 CollectiveSettingPage
                 [ i [ class "users circle icon" ] []
-                , text "Collective Settings"
+                , text "Collective Profile"
                 ]
             , menuEntry model
                 UserSettingPage
                 [ i [ class "user circle icon" ] []
-                , text "User Settings"
+                , text "User Profile"
                 ]
             , div [ class "divider" ] []
             , menuEntry model
modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm (new file, 204 lines)
@ -0,0 +1,204 @@
module Comp.ClassifierSettingsForm exposing
    ( Model
    , Msg
    , getSettings
    , init
    , update
    , view
    )

import Api
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
import Api.Model.TagList exposing (TagList)
import Comp.CalEventInput
import Comp.FixedDropdown
import Comp.IntField
import Data.CalEvent exposing (CalEvent)
import Data.Flags exposing (Flags)
import Data.Validated exposing (Validated(..))
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck)
import Http
import Util.Tag


type alias Model =
    { enabled : Bool
    , categoryModel : Comp.FixedDropdown.Model String
    , category : Maybe String
    , scheduleModel : Comp.CalEventInput.Model
    , schedule : Validated CalEvent
    , itemCountModel : Comp.IntField.Model
    , itemCount : Maybe Int
    }


type Msg
    = GetTagsResp (Result Http.Error TagList)
    | ScheduleMsg Comp.CalEventInput.Msg
    | ToggleEnabled
    | CategoryMsg (Comp.FixedDropdown.Msg String)
    | ItemCountMsg Comp.IntField.Msg


init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
init flags sett =
    let
        newSchedule =
            Data.CalEvent.fromEvent sett.schedule
                |> Maybe.withDefault Data.CalEvent.everyMonth

        ( cem, cec ) =
            Comp.CalEventInput.init flags newSchedule
    in
    ( { enabled = sett.enabled
      , categoryModel = Comp.FixedDropdown.initString []
      , category = sett.category
      , scheduleModel = cem
      , schedule = Data.Validated.Unknown newSchedule
      , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
      , itemCount = Just sett.itemCount
      }
    , Cmd.batch
        [ Api.getTags flags "" GetTagsResp
        , Cmd.map ScheduleMsg cec
        ]
    )


getSettings : Model -> Validated ClassifierSetting
getSettings model =
    Data.Validated.map
        (\sch ->
            { enabled = model.enabled
            , category = model.category
            , schedule =
                Data.CalEvent.makeEvent sch
            , itemCount = Maybe.withDefault 0 model.itemCount
            }
        )
        model.schedule


update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
update flags msg model =
    case msg of
        GetTagsResp (Ok tl) ->
            let
                categories =
                    Util.Tag.getCategories tl.items
                        |> List.sort
            in
            ( { model
                | categoryModel = Comp.FixedDropdown.initString categories
                , category =
                    if model.category == Nothing then
                        List.head categories

                    else
                        model.category
              }
            , Cmd.none
            )

        GetTagsResp (Err _) ->
            ( model, Cmd.none )

        ScheduleMsg lmsg ->
            let
                ( cm, cc, ce ) =
                    Comp.CalEventInput.update
                        flags
                        (Data.Validated.value model.schedule)
                        lmsg
                        model.scheduleModel
            in
            ( { model
                | scheduleModel = cm
                , schedule = ce
              }
            , Cmd.map ScheduleMsg cc
            )

        ToggleEnabled ->
            ( { model | enabled = not model.enabled }
            , Cmd.none
            )

        CategoryMsg lmsg ->
            let
                ( mm, ma ) =
                    Comp.FixedDropdown.update lmsg model.categoryModel
            in
            ( { model
                | categoryModel = mm
                , category =
                    if ma == Nothing then
                        model.category

                    else
                        ma
              }
            , Cmd.none
            )

        ItemCountMsg lmsg ->
            let
                ( im, iv ) =
                    Comp.IntField.update lmsg model.itemCountModel
            in
            ( { model
                | itemCountModel = im
                , itemCount = iv
              }
            , Cmd.none
            )


view : Model -> Html Msg
view model =
    div []
        [ div
            [ class "field"
            ]
            [ div [ class "ui checkbox" ]
                [ input
                    [ type_ "checkbox"
                    , onCheck (\_ -> ToggleEnabled)
                    , checked model.enabled
                    ]
                    []
                , label [] [ text "Enable classification" ]
                , span [ class "small-info" ]
                    [ text "Disable document classification if not needed."
                    ]
                ]
            ]
        , div [ class "ui basic segment" ]
            [ text "Document classification tries to predict a tag for new incoming documents. This "
            , text "works by learning from existing documents in order to find common patterns within "
            , text "the text. The more documents you have correctly tagged, the better. Learning is done "
            , text "periodically based on a schedule and you need to specify a tag-group that should "
            , text "be used for learning."
            ]
        , div [ class "field" ]
            [ label [] [ text "Category" ]
            , Html.map CategoryMsg
                (Comp.FixedDropdown.viewString model.category
                    model.categoryModel
                )
            ]
        , Html.map ItemCountMsg
            (Comp.IntField.viewWithInfo
                "The maximum number of items to learn from, ordered by date, newest first. Use 0 to mean all."
                model.itemCount
                "field"
                model.itemCountModel
            )
        , div [ class "field" ]
            [ label [] [ text "Schedule" ]
            , Html.map ScheduleMsg
                (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
            ]
        ]
@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
 import Api
 import Api.Model.BasicResult exposing (BasicResult)
 import Api.Model.CollectiveSettings exposing (CollectiveSettings)
+import Comp.ClassifierSettingsForm
 import Comp.Dropdown
 import Data.Flags exposing (Flags)
 import Data.Language exposing (Language)
 import Data.UiSettings exposing (UiSettings)
+import Data.Validated exposing (Validated)
 import Html exposing (..)
 import Html.Attributes exposing (..)
 import Html.Events exposing (onCheck, onClick, onInput)
@ -27,44 +29,60 @@ type alias Model =
     , initSettings : CollectiveSettings
     , fullTextConfirmText : String
     , fullTextReIndexResult : Maybe BasicResult
+    , classifierModel : Comp.ClassifierSettingsForm.Model
+    , startClassifierResult : Maybe BasicResult
     }


-init : CollectiveSettings -> Model
-init settings =
+init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
+init flags settings =
     let
         lang =
             Data.Language.fromString settings.language
                 |> Maybe.withDefault Data.Language.German
+
+        ( cm, cc ) =
+            Comp.ClassifierSettingsForm.init flags settings.classifier
     in
-    { langModel =
+    ( { langModel =
             Comp.Dropdown.makeSingleList
                 { makeOption =
                     \l ->
                         { value = Data.Language.toIso3 l
                         , text = Data.Language.toName l
                         , additional = ""
                         }
                 , placeholder = ""
                 , options = Data.Language.all
                 , selected = Just lang
                 }
-    , intEnabled = settings.integrationEnabled
-    , initSettings = settings
-    , fullTextConfirmText = ""
-    , fullTextReIndexResult = Nothing
-    }
+      , intEnabled = settings.integrationEnabled
+      , initSettings = settings
+      , fullTextConfirmText = ""
+      , fullTextReIndexResult = Nothing
+      , classifierModel = cm
+      , startClassifierResult = Nothing
+      }
+    , Cmd.map ClassifierSettingMsg cc
+    )


-getSettings : Model -> CollectiveSettings
+getSettings : Model -> Validated CollectiveSettings
 getSettings model =
-    CollectiveSettings
-        (Comp.Dropdown.getSelected model.langModel
-            |> List.head
-            |> Maybe.map Data.Language.toIso3
-            |> Maybe.withDefault model.initSettings.language
-        )
-        model.intEnabled
+    Data.Validated.map
+        (\cls ->
+            { language =
+                Comp.Dropdown.getSelected model.langModel
+                    |> List.head
+                    |> Maybe.map Data.Language.toIso3
+                    |> Maybe.withDefault model.initSettings.language
+            , integrationEnabled = model.intEnabled
+            , classifier = cls
+            }
+        )
+        (Comp.ClassifierSettingsForm.getSettings
+            model.classifierModel
+        )


 type Msg
@ -73,6 +91,10 @@ type Msg
     | SetFullTextConfirm String
     | TriggerReIndex
     | TriggerReIndexResult (Result Http.Error BasicResult)
+    | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
+    | SaveSettings
+    | StartClassifierTask
+    | StartClassifierResp (Result Http.Error BasicResult)


 update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@ -85,22 +107,15 @@ update flags msg model =
             nextModel =
                 { model | langModel = m2 }
-
-            nextSettings =
-                if Comp.Dropdown.isDropdownChangeMsg m then
-                    Just (getSettings nextModel)
-
-                else
-                    Nothing
         in
-        ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
+        ( nextModel, Cmd.map LangDropdownMsg c2, Nothing )

         ToggleIntegrationEndpoint ->
             let
                 nextModel =
                     { model | intEnabled = not model.intEnabled }
             in
-            ( nextModel, Cmd.none, Just (getSettings nextModel) )
+            ( nextModel, Cmd.none, Nothing )

         SetFullTextConfirm str ->
             ( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@ -138,12 +153,50 @@ update flags msg model =
                 , Nothing
                 )

+        ClassifierSettingMsg lmsg ->
+            let
+                ( cm, cc ) =
+                    Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
+            in
+            ( { model
+                | classifierModel = cm
+              }
+            , Cmd.map ClassifierSettingMsg cc
+            , Nothing
+            )
+
+        SaveSettings ->
+            case getSettings model of
+                Data.Validated.Valid s ->
+                    ( model, Cmd.none, Just s )
+
+                _ ->
+                    ( model, Cmd.none, Nothing )
+
+        StartClassifierTask ->
+            ( model, Api.startClassifier flags StartClassifierResp, Nothing )
+
+        StartClassifierResp (Ok br) ->
+            ( { model | startClassifierResult = Just br }
+            , Cmd.none
+            , Nothing
+            )
+
+        StartClassifierResp (Err err) ->
+            ( { model
+                | startClassifierResult =
+                    Just (BasicResult False (Util.Http.errorToString err))
+              }
+            , Cmd.none
+            , Nothing
+            )


 view : Flags -> UiSettings -> Model -> Html Msg
 view flags settings model =
     div
         [ classList
-            [ ( "ui form", True )
+            [ ( "ui form error success", True )
             , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
             , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
             ]
@ -219,17 +272,62 @@ view flags settings model =
             [ text "This starts a task that clears the full-text index and re-indexes all your data again."
             , text "You must type OK before clicking the button to avoid accidental re-indexing."
             ]
-        , div
-            [ classList
-                [ ( "ui message", True )
-                , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
-                , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
-                , ( "hidden invisible", model.fullTextReIndexResult == Nothing )
-                ]
-            ]
-            [ Maybe.map .message model.fullTextReIndexResult
-                |> Maybe.withDefault ""
-                |> text
-            ]
-        ]
+        , renderResultMessage model.fullTextReIndexResult
+        ]
+    , h3
+        [ classList
+            [ ( "ui dividing header", True )
+            , ( "invisible hidden", False )
+            ]
+        ]
+        [ text "Document Classifier"
+        ]
+    , div
+        [ classList
+            [ ( "field", True )
+            , ( "invisible hidden", False )
+            ]
+        ]
+        [ Html.map ClassifierSettingMsg
+            (Comp.ClassifierSettingsForm.view model.classifierModel)
+        , div [ class "ui vertical segment" ]
+            [ button
+                [ classList
+                    [ ( "ui small secondary basic button", True )
+                    , ( "disabled", not model.classifierModel.enabled )
+                    ]
+                , title "Starts a task to train a classifier"
+                , onClick StartClassifierTask
+                ]
+                [ text "Start now"
+                ]
+            , renderResultMessage model.startClassifierResult
+            ]
+        ]
+    , div [ class "ui divider" ] []
+    , button
+        [ classList
+            [ ( "ui primary button", True )
+            , ( "disabled", getSettings model |> Data.Validated.isInvalid )
+            ]
+        , onClick SaveSettings
+        ]
+        [ text "Save"
+        ]
+    ]
+
+
+renderResultMessage : Maybe BasicResult -> Html msg
+renderResultMessage result =
+    div
+        [ classList
+            [ ( "ui message", True )
+            , ( "error", Maybe.map .success result == Just False )
+            , ( "success", Maybe.map .success result == Just True )
+            , ( "hidden invisible", result == Nothing )
+            ]
+        ]
+        [ Maybe.map .message result
+            |> Maybe.withDefault ""
+            |> text
         ]
@ -1,5 +1,6 @@
 module Data.Validated exposing
     ( Validated(..)
+    , isInvalid
     , map
     , map2
     , map3
@ -14,6 +15,19 @@ type Validated a
     | Unknown a


+isInvalid : Validated a -> Bool
+isInvalid v =
+    case v of
+        Valid _ ->
+            False
+
+        Invalid _ _ ->
+            True
+
+        Unknown _ ->
+            False
+
+
 value : Validated a -> a
 value va =
     case va of
@ -30,15 +30,21 @@ init flags =
     let
         ( sm, sc ) =
             Comp.SourceManage.init flags
+
+        ( cm, cc ) =
+            Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
     in
     ( { currentTab = Just InsightsTab
       , sourceModel = sm
       , userModel = Comp.UserManage.emptyModel
-      , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
+      , settingsModel = cm
       , insights = Api.Model.ItemInsights.empty
       , submitResult = Nothing
       }
-    , Cmd.map SourceMsg sc
+    , Cmd.batch
+        [ Cmd.map SourceMsg sc
+        , Cmd.map SettingsFormMsg cc
+        ]
     )


@ -77,7 +77,13 @@ update flags msg model =
             ( model, Cmd.none )

         CollectiveSettingsResp (Ok data) ->
-            ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
+            let
+                ( cm, cc ) =
+                    Comp.CollectiveSettingsForm.init flags data
+            in
+            ( { model | settingsModel = cm }
+            , Cmd.map SettingsFormMsg cc
+            )

         CollectiveSettingsResp (Err _) ->
             ( model, Cmd.none )
@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
 viewSettings flags settings model =
     [ h2 [ class "ui header" ]
         [ i [ class "cog icon" ] []
-        , text "Settings"
+        , text "Collective Settings"
         ]
     , div [ class "ui segment" ]
-        [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
+        [ Html.map SettingsFormMsg
+            (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
         ]
     , div
         [ classList
@ -95,6 +95,21 @@ let
         enabled = true;
         file-cache-time = "1 minute";
       };
+      classification = {
+        enabled = true;
+        item-count = 0;
+        classifiers = [
+          { "useSplitWords" = "true";
+            "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
+            "splitWordsIgnoreRegexp" = ''\s+'';
+            "useSplitPrefixSuffixNGrams" = "true";
+            "maxNGramLeng" = "4";
+            "minNGramLeng" = "1";
+            "splitWordShape" = "chris4";
+            "intern" = "true";
+          }
+        ];
+      };
       working-dir = "/tmp/docspell-analysis";
     };
     processing = {
@ -736,6 +751,59 @@ in {
           default = defaults.text-analysis.regex-ner;
           description = "";
         };
+
+        classification = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.classification.enabled;
+                description = ''
+                  Whether to enable classification globally. Each collective can
+                  decide to disable it. If it is disabled here, no collective
+                  can use classification.
+                '';
+              };
+              item-count = mkOption {
+                type = types.int;
+                default = defaults.text-analysis.classification.item-count;
+                description = ''
+                  If concerned with memory consumption, this restricts the
+                  number of items to consider. More are better for training. A
+                  negative value or zero means to train on all items.
+                '';
+              };
+              classifiers = mkOption {
+                type = types.listOf types.attrs;
+                default = defaults.text-analysis.classification.classifiers;
+                description = ''
+                  These settings are used to configure the classifier. If
+                  multiple are given, they are all tried and the "best" is
+                  chosen at the end. See
+                  https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+                  for more info about these settings. The settings here yielded
+                  good results with *my* dataset.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.classification;
+          description = ''
+            Settings for doing document classification.
+
+            This works by learning from existing documents. A collective can
+            specify a tag category and the system will try to predict a tag
+            from this category for new incoming documents.
+
+            This requires a statistical model that is computed from all
+            existing documents. This process is run periodically as
+            configured by the collective. It may require a lot of memory,
+            depending on the amount of data.
+
+            It utilises this NLP library: https://nlp.stanford.edu/.
+          '';
+        };
       };
     });
     default = defaults.text-analysis;
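The classifier attribute set above maps directly to properties of Stanford NLP's ColumnDataClassifier (see the javadoc linked in the option description). A minimal sketch of feeding such properties to the Stanford API; the wiring here is illustrative, not docspell's actual code:

import java.util.Properties
import edu.stanford.nlp.classify.ColumnDataClassifier

// Each key/value pair from the config becomes a classifier property.
val props = new Properties()
props.setProperty("useSplitWords", "true")
props.setProperty("useSplitPrefixSuffixNGrams", "true")
props.setProperty("maxNGramLeng", "4")
props.setProperty("minNGramLeng", "1")
props.setProperty("splitWordShape", "chris4")
props.setProperty("intern", "true")

val cdc = new ColumnDataClassifier(props)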
@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
     , { image = "img/analyze-feature.png"
       , header = "Text Analysis"
       , description = """
-The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
+The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically.
 """
       }
     , { image = "img/filetype-feature.svg"
@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
 "assignment" semantics. Docspell doesn't propose any workflow, but it
 can help to implement some.

-The tags are *not* taken into account when creating suggestions from
-analyzed text yet. However, PDF files may contain metadata itself and
-if there is a metadata *keywords* list, these keywords are matched
-against the tags in the database. If they match, the item is tagged
-automatically.
+Docspell can try to predict a tag for new incoming documents
+automatically, based on your existing data. This requires training an
+algorithm. There are some caveats: the more data you have correctly
+tagged, the better the results. So it won't work well for, say, the
+first 100 documents. The tags must also relate to some pattern in the
+document text. Tags like *todo* or *waiting* obviously won't work, but
+the typical "document type" tags, like *invoice* and *receipt*, are a
+good fit. That is why you need to provide a tag category, so only
+sensible tags are learned. The algorithm goes through all your items
+and learns patterns in the text that relate to the given tags. This
+training step can be run periodically, as specified in your collective
+settings, such that docspell keeps learning from your already tagged
+data. More information about the algorithm can be found in the config,
+where it is possible to fine-tune this process.
+
+Another way to have items tagged automatically is when an input PDF
+file contains a list of keywords in its metadata section. These
+keywords are matched against the tags in the database; if they match,
+the item is tagged with them.


 ## Organization and Person