Merge pull request #242 from eikek/classifier

Classifier
This commit is contained in:
mergify[bot] 2020-09-02 20:50:13 +00:00 committed by GitHub
commit 8cb78e3dbe
41 changed files with 1458 additions and 102 deletions

View File

@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact
import docspell.analysis.date.DateFind
import docspell.analysis.nlp.PipelineCache
import docspell.analysis.nlp.StanfordNerClassifier
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.nlp.StanfordNerSettings
import docspell.analysis.nlp.StanfordTextClassifier
import docspell.analysis.nlp.TextClassifier
import docspell.common._
trait TextAnalyser[F[_]] {
def annotate(
logger: Logger[F],
settings: StanfordSettings,
settings: StanfordNerSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result]
def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F]
}
object TextAnalyser {
@ -35,7 +38,7 @@ object TextAnalyser {
new TextAnalyser[F] {
def annotate(
logger: Logger[F],
settings: StanfordSettings,
settings: StanfordNerSettings,
cacheKey: Ident,
text: String
): F[TextAnalyser.Result] =
@ -48,6 +51,11 @@ object TextAnalyser {
spans = NerLabelSpan.build(list)
} yield Result(spans ++ list, dates)
def classifier(blocker: Blocker)(implicit
CS: ContextShift[F]
): TextClassifier[F] =
new StanfordTextClassifier[F](cfg.classifier, blocker)
private def textLimit(logger: Logger[F], text: String): F[String] =
if (text.length <= cfg.maxLength) text.pure[F]
else
@ -56,7 +64,7 @@ object TextAnalyser {
s" Analysing only first ${cfg.maxLength} characters."
) *> text.take(cfg.maxLength).pure[F]
private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
: F[Vector[NerLabel]] =
StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
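
Taken together, the renamed settings type and the new classifier accessor are used as in this minimal sketch (an in-scope analyser: TextAnalyser[IO] and blocker: Blocker are assumed; all names here are hypothetical):

import cats.effect.{Blocker, ContextShift, IO}
import docspell.analysis.nlp.{StanfordNerSettings, TextClassifier}

// annotate now takes StanfordNerSettings (renamed from StanfordSettings);
// classifier additionally needs a Blocker and ContextShift for file IO.
def classifierOf(analyser: TextAnalyser[IO], blocker: Blocker)(implicit
    CS: ContextShift[IO]
): TextClassifier[IO] =
  analyser.classifier(blocker)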

View File

@ -1,5 +1,8 @@
package docspell.analysis
import docspell.analysis.nlp.TextClassifierConfig
case class TextAnalysisConfig(
maxLength: Int
maxLength: Int,
classifier: TextClassifierConfig
)

View File

@ -0,0 +1,5 @@
package docspell.analysis.nlp
import java.nio.file.Path
case class ClassifierModel(model: Path)

View File

@ -19,7 +19,7 @@ import org.log4s.getLogger
*/
trait PipelineCache[F[_]] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP]
}
@ -28,7 +28,7 @@ object PipelineCache {
def none[F[_]: Applicative]: PipelineCache[F] =
new PipelineCache[F] {
def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
makeClassifier(settings).pure[F]
}
@ -38,7 +38,7 @@ object PipelineCache {
final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
extends PipelineCache[F] {
def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
for {
id <- makeSettingsId(settings)
nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
@ -48,7 +48,7 @@ object PipelineCache {
key: String,
id: String,
cache: Map[String, Entry],
settings: StanfordSettings
settings: StanfordNerSettings
): (Map[String, Entry], StanfordCoreNLP) =
cache.get(key) match {
case Some(entry) =>
@ -68,7 +68,7 @@ object PipelineCache {
(cache.updated(key, e), nlp)
}
private def makeSettingsId(settings: StanfordSettings): F[String] = {
private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
val base = settings.copy(regexNer = None).toString
val size: F[Long] =
settings.regexNer match {
@ -81,7 +81,7 @@ object PipelineCache {
}
}
private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
new StanfordCoreNLP(Properties.forSettings(settings))
}

View File

@ -7,6 +7,9 @@ import docspell.common._
object Properties {
def fromMap(m: Map[String, String]): JProps =
apply(m.toSeq: _*)
def apply(ps: (String, String)*): JProps = {
val p = new JProps()
for ((k, v) <- ps)
@ -14,7 +17,7 @@ object Properties {
p
}
def forSettings(settings: StanfordSettings): JProps = {
def forSettings(settings: StanfordNerSettings): JProps = {
val regexNerFile = settings.regexNer
.map(p => p.normalize().toAbsolutePath().toString())
settings.lang match {

View File

@ -25,7 +25,7 @@ object StanfordNerClassifier {
def nerAnnotate[F[_]: Applicative](
cacheKey: String,
cache: PipelineCache[F]
)(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
)(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
cache
.obtain(cacheKey, settings)
.map(crf => runClassifier(crf, text))

View File

@ -19,4 +19,8 @@ import docspell.common._
* as a last step to tag untagged tokens using the provided list of
* regexps.
*/
case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
case class StanfordNerSettings(
lang: Language,
highRecall: Boolean,
regexNer: Option[Path]
)
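
Construction is unchanged apart from the name; a small sketch (the language value is an assumption):

val settings = StanfordNerSettings(Language.English, highRecall = false, regexNer = None)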

View File

@ -0,0 +1,153 @@
package docspell.analysis.nlp
import java.nio.file.Path
import cats.effect._
import cats.effect.concurrent.Ref
import cats.implicits._
import fs2.Stream
import docspell.analysis.nlp.TextClassifier._
import docspell.common._
import edu.stanford.nlp.classify.ColumnDataClassifier
final class StanfordTextClassifier[F[_]: Sync: ContextShift](
cfg: TextClassifierConfig,
blocker: Blocker
) extends TextClassifier[F] {
def trainClassifier[A](
logger: Logger[F],
data: Stream[F, Data]
)(handler: TextClassifier.Handler[F, A]): F[A] =
File
.withTempDir(cfg.workingDir, "trainclassifier")
.use { dir =>
for {
rawData <- writeDataFile(blocker, dir, data)
_ <- logger.info(s"Learning from ${rawData.count} items.")
trainData <- splitData(logger, rawData)
scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
sorted = scores.sortBy(-_.score)
res <- handler(sorted.head.model)
} yield res
}
def classify(
logger: Logger[F],
model: ClassifierModel,
text: String
): F[Option[String]] =
Sync[F].delay {
val cls = ColumnDataClassifier.getClassifier(
model.model.normalize().toAbsolutePath().toString()
)
val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
Option(cat)
}
// --- helpers
def train(
logger: Logger[F],
in: TrainData,
props: Map[String, String]
): F[TrainResult] =
for {
_ <- logger.debug(s"Training classifier from $props")
res <- Sync[F].delay {
val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
cdc.trainClassifier(in.train.toString())
val score = cdc.testClassifier(in.test.toString())
TrainResult(score.first(), ClassifierModel(in.modelFile))
}
_ <- logger.debug(s"Trained with result $res")
} yield res
def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
val nTest = (in.count * 0.15).toLong
val td =
TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
val fileLines =
fs2.io.file
.readAll(in.file, blocker, 4096)
.through(fs2.text.utf8Decode)
.through(fs2.text.lines)
for {
_ <- logger.debug(
s"Splitting raw data into test/train data. Testing with $nTest entries"
)
_ <-
fileLines
.take(nTest)
.intersperse("\n")
.through(fs2.text.utf8Encode)
.through(fs2.io.file.writeAll(td.test, blocker))
.compile
.drain
_ <-
fileLines
.drop(nTest)
.intersperse("\n")
.through(fs2.text.utf8Encode)
.through(fs2.io.file.writeAll(td.train, blocker))
.compile
.drain
} yield td
}
def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
val target = dir.resolve("rawdata")
for {
counter <- Ref.of[F, Long](0L)
_ <-
data
.filter(_.text.nonEmpty)
.map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}")
.evalTap(_ => counter.update(_ + 1))
.intersperse("\r\n")
.through(fs2.text.utf8Encode)
.through(fs2.io.file.writeAll(target, blocker))
.compile
.drain
lines <- counter.get
} yield RawData(lines, target)
}
def normalisedText(text: String): String =
text.replaceAll("[\n\r\t]+", " ")
def fixRef(str: String): String =
str.replace('\t', '_')
def amendProps(
trainData: TrainData,
props: Map[String, String]
): Map[String, String] =
prepend("2.", props) ++ Map(
"trainFile" -> trainData.train.normalize().toAbsolutePath().toString(),
"testFile" -> trainData.test.normalize().toAbsolutePath().toString(),
"serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
).toList
case class RawData(count: Long, file: Path)
case class TrainData(train: Path, test: Path) {
val modelFile = train.resolveSibling("model.ser.gz")
}
case class TrainResult(score: Double, model: ClassifierModel)
def prepend(pre: String, data: Map[String, String]): Map[String, String] =
data.toList
.map({
case (k, v) =>
if (k.startsWith(pre)) (k, v)
else (pre + k, v)
})
.toMap
}
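
For orientation, the data-file format ties the pieces above together: writeDataFile emits one tab-separated line per item (class, ref, normalised text), splitData reserves roughly 15% of those lines as the test set, and amendProps prefixes each option with "2." so it applies to the text column. A hypothetical raw-data line (values invented):

// One line of the raw data file, as written by writeDataFile:
val line = "invoice\tn52Xa\tthis is your invoice total $421"
// classify later prepends "\t\t" to the input text, so at prediction time
// the text occupies the same (third) column it occupied during training.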

View File

@ -0,0 +1,25 @@
package docspell.analysis.nlp
import cats.data.Kleisli
import fs2.Stream
import docspell.analysis.nlp.TextClassifier.Data
import docspell.common._
trait TextClassifier[F[_]] {
def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])(
handler: TextClassifier.Handler[F, A]
): F[A]
def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]]
}
object TextClassifier {
type Handler[F[_], A] = Kleisli[F, ClassifierModel, A]
case class Data(cls: String, ref: String, text: String)
}
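
A minimal sketch of a Handler: it receives the trained model while the temporary training directory still exists, so it must persist the file before returning (names and target path are hypothetical):

import java.nio.file.{Files, Path, StandardCopyOption}
import cats.data.Kleisli
import cats.effect.IO

def persistTo(target: Path): TextClassifier.Handler[IO, Path] =
  Kleisli { model =>
    // copy the model out of the temp dir before trainClassifier removes it
    IO(Files.copy(model.model, target, StandardCopyOption.REPLACE_EXISTING))
  }

The LearnClassifierTask further down does the same, except it streams the file into bitpeace storage instead of the local filesystem.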

View File

@ -0,0 +1,10 @@
package docspell.analysis.nlp
import java.nio.file.Path
import cats.data.NonEmptyList
case class TextClassifierConfig(
workingDir: Path,
classifierConfigs: NonEmptyList[Map[String, String]]
)
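
For illustration, constructing a config with a single candidate settings map (path and options here are assumptions, mirroring the joex defaults shown later):

import java.nio.file.Paths
import cats.data.NonEmptyList

val cfg = TextClassifierConfig(
  Paths.get("/tmp/docspell-analysis"),
  NonEmptyList.of(Map("useSplitWords" -> "true", "maxNGramLeng" -> "4"))
)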

Binary file not shown.

View File

@ -0,0 +1,76 @@
package docspell.analysis.nlp
import minitest._
import cats.effect._
import scala.concurrent.ExecutionContext
import java.nio.file.Paths
import cats.data.NonEmptyList
import docspell.common._
import fs2.Stream
import cats.data.Kleisli
import TextClassifier.Data
object StanfordTextClassifierSuite extends SimpleTestSuite {
val logger = Logger.log4s[IO](org.log4s.getLogger)
implicit val CS = IO.contextShift(ExecutionContext.global)
test("learn from data") {
val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
val data =
Stream
.emit(Data("invoice", "n", "this is your invoice total $421"))
.repeat
.take(10)
.zip(
Stream
.emit(Data("receipt", "n", "shopping receipt cheese cake bar"))
.repeat
.take(10)
)
.flatMap({
case (a, b) =>
Stream.emits(Seq(a, b))
})
.covary[IO]
val modelExists =
Blocker[IO].use { blocker =>
val classifier = new StanfordTextClassifier[IO](cfg, blocker)
classifier.trainClassifier[Boolean](logger, data)(
Kleisli(result => File.existsNonEmpty[IO](result.model))
)
}
assertEquals(modelExists.unsafeRunSync(), true)
}
test("run classifier") {
val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
val things = for {
dir <- File.withTempDir[IO](Paths.get("target"), "testcls")
blocker <- Blocker[IO]
} yield (dir, blocker)
things
.use {
case (dir, blocker) =>
val classifier = new StanfordTextClassifier[IO](cfg, blocker)
val modelFile = dir.resolve("test.ser.gz")
for {
_ <-
LenientUri
.fromJava(getClass.getResource("/test.ser.gz"))
.readURL[IO](4096, blocker)
.through(fs2.io.file.writeAll(modelFile, blocker))
.compile
.drain
model = ClassifierModel(modelFile)
cat <- classifier.classify(logger, model, "there is receipt always")
_ = assertEquals(cat, Some("receipt"))
} yield ()
}
.unsafeRunSync()
}
}

View File

@ -52,12 +52,12 @@ object BackendApp {
queue <- JobQueue(store)
loginImpl <- Login[F](store)
signupImpl <- OSignup[F](store)
collImpl <- OCollective[F](store)
joexImpl <- OJoex(JoexClient(httpClient), store)
collImpl <- OCollective[F](store, utStore, queue, joexImpl)
sourceImpl <- OSource[F](store)
tagImpl <- OTag[F](store)
equipImpl <- OEquipment[F](store)
orgImpl <- OOrganization(store)
joexImpl <- OJoex(JoexClient(httpClient), store)
uploadImpl <- OUpload(store, queue, cfg.files, joexImpl)
nodeImpl <- ONode(store)
jobImpl <- OJob(store, joexImpl)

View File

@ -8,14 +8,21 @@ import docspell.backend.PasswordCrypt
import docspell.backend.ops.OCollective._
import docspell.common._
import docspell.store.queries.QCollective
import docspell.store.queue.JobQueue
import docspell.store.records._
import docspell.store.usertask.UserTask
import docspell.store.usertask.UserTaskStore
import docspell.store.{AddResult, Store}
import com.github.eikek.calev._
trait OCollective[F[_]] {
def find(name: Ident): F[Option[RCollective]]
def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]
def findSettings(collective: Ident): F[Option[OCollective.Settings]]
def listUser(collective: Ident): F[Vector[RUser]]
@ -43,6 +50,7 @@ trait OCollective[F[_]] {
def findEnabledSource(sourceId: Ident): F[Option[RSource]]
def startLearnClassifier(collective: Ident): F[Unit]
}
object OCollective {
@ -55,6 +63,8 @@ object OCollective {
type Settings = RCollective.Settings
val Settings = RCollective.Settings
type Classifier = RClassifierSetting.Classifier
val Classifier = RClassifierSetting.Classifier
sealed trait PassChangeResult
object PassChangeResult {
@ -91,7 +101,12 @@ object OCollective {
}
}
def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] =
def apply[F[_]: Effect](
store: Store[F],
uts: UserTaskStore[F],
queue: JobQueue[F],
joex: OJoex[F]
): Resource[F, OCollective[F]] =
Resource.pure[F, OCollective[F]](new OCollective[F] {
def find(name: Ident): F[Option[RCollective]] =
store.transact(RCollective.findById(name))
@ -101,6 +116,41 @@ object OCollective {
.transact(RCollective.updateSettings(collective, sett))
.attempt
.map(AddResult.fromUpdate)
.flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F])
def updateLearnClassifierTask(coll: Ident, sett: Settings) =
for {
id <- Ident.randomId[F]
on = sett.classifier.map(_.enabled).getOrElse(false)
timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe(""))
ut = UserTask(
id,
LearnClassifierArgs.taskName,
on,
timer,
LearnClassifierArgs(coll)
)
_ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut)
_ <- joex.notifyAllNodes
} yield ()
def startLearnClassifier(collective: Ident): F[Unit] =
for {
id <- Ident.randomId[F]
ut <- UserTask(
id,
LearnClassifierArgs.taskName,
true,
CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All),
LearnClassifierArgs(collective)
).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName))
job <- ut.toJob
_ <- queue.insert(job)
_ <- joex.notifyAllNodes
} yield ()
def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
store.transact(RCollective.getSettings(collective))
def listUser(collective: Ident): F[Vector[RUser]] =
store.transact(RUser.findAll(collective, _.login))

View File

@ -0,0 +1,35 @@
package docspell.common
import docspell.common.syntax.all._
import io.circe._
import io.circe.generic.semiauto._
/** Arguments to the classify-item task.
*
* This task is run periodically and learns from existing documents
* to create a model for predicting tags of new documents. The user
* must give a tag category that defines the set of possible tags.
*/
case class LearnClassifierArgs(
collective: Ident
) {
def makeSubject: String =
"Learn tags"
}
object LearnClassifierArgs {
val taskName = Ident.unsafe("learn-classifier")
implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
deriveEncoder[LearnClassifierArgs]
implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
deriveDecoder[LearnClassifierArgs]
def parse(str: String): Either[Throwable, LearnClassifierArgs] =
str.parseJsonAs[LearnClassifierArgs]
}
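
A round-trip sketch with the derived codecs (the collective id is invented; Ident is assumed to encode as a plain JSON string):

import io.circe.syntax._

val args = LearnClassifierArgs(Ident.unsafe("acme"))
val json = args.asJson.noSpaces
// json == """{"collective":"acme"}"""
assert(LearnClassifierArgs.parse(json) == Right(args))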

View File

@ -271,6 +271,50 @@ docspell.joex {
# file will be kept until a check for a state change is done.
file-cache-time = "1 minute"
}
# Settings for doing document classification.
#
# This works by learning from existing documents. A collective can
# specify a tag category and the system will try to predict a tag
# from this category for new incoming documents.
#
# This requires a statistical model that is computed from all
# existing documents. This process is run periodically as
# configured by the collective. It may require a lot of memory,
# depending on the amount of data.
#
# It utilises this NLP library: https://nlp.stanford.edu/.
classification {
# Whether to enable classification globally. Each collective can
# decide to disable it. If it is disabled here, no collective
# can use classification.
enabled = true
# If concerned with memory consumption, this restricts the
# number of items to consider. More items are better for training. A
# negative value or zero means to train on all items.
item-count = 0
# These settings are used to configure the classifier. If
# multiple are given, they are all tried and the "best" is
# chosen at the end. See
# https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
# for more info about these settings. The settings here yielded
# good results with *my* dataset.
#
# Enclose regexps in triple quotes.
classifiers = [
{ "useSplitWords" = "true"
"splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
"splitWordsIgnoreRegexp" = """\s+"""
"useSplitPrefixSuffixNGrams" = "true"
"maxNGramLeng" = "4"
"minNGramLeng" = "1"
"splitWordShape" = "chris4"
"intern" = "true" # makes it slower but saves memory
}
]
}
}
# Configuration for converting files into PDFs.

View File

@ -2,7 +2,10 @@ package docspell.joex
import java.nio.file.Path
import cats.data.NonEmptyList
import docspell.analysis.TextAnalysisConfig
import docspell.analysis.nlp.TextClassifierConfig
import docspell.backend.Config.Files
import docspell.common._
import docspell.convert.ConvertConfig
@ -57,15 +60,30 @@ object Config {
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
regexNer: RegexNer
regexNer: RegexNer,
classification: Classification
) {
def textAnalysisConfig: TextAnalysisConfig =
TextAnalysisConfig(maxLength)
TextAnalysisConfig(
maxLength,
TextClassifierConfig(
workingDir,
NonEmptyList
.fromList(classification.classifiers)
.getOrElse(NonEmptyList.of(Map.empty))
)
)
def regexNerFileConfig: RegexNerFile.Config =
RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
}
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
case class Classification(
enabled: Boolean,
itemCount: Int,
classifiers: List[Map[String, String]]
)
}

View File

@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient
import docspell.joex.analysis.RegexNerFile
import docspell.joex.fts.{MigrationTask, ReIndexTask}
import docspell.joex.hk._
import docspell.joex.learn.LearnClassifierTask
import docspell.joex.notify._
import docspell.joex.pdfconv.ConvertAllPdfTask
import docspell.joex.pdfconv.PdfConvTask
@ -159,6 +160,13 @@ object JoexAppImpl {
ConvertAllPdfTask.onCancel[F]
)
)
.withTask(
JobTask.json(
LearnClassifierArgs.taskName,
LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
LearnClassifierTask.onCancel[F]
)
)
.resource
psch <- PeriodicScheduler.create(
cfg.periodicScheduler,

View File

@ -0,0 +1,111 @@
package docspell.joex.learn
import cats.data.Kleisli
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import fs2.{Pipe, Stream}
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.TextClassifier.Data
import docspell.backend.ops.OCollective
import docspell.common._
import docspell.joex.Config
import docspell.joex.scheduler._
import docspell.store.queries.QItem
import docspell.store.records.RClassifierSetting
import bitpeace.MimetypeHint
object LearnClassifierTask {
val noClass = "__NONE__"
val pageSep = " --n-- "
type Args = LearnClassifierArgs
def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
Task.log(_.warn("Cancelling learn-classifier task"))
def apply[F[_]: Sync: ContextShift](
cfg: Config.TextAnalysis,
blocker: Blocker,
analyser: TextAnalyser[F]
): Task[F, Args, Unit] =
Task { ctx =>
(for {
sett <- findActiveSettings[F](ctx, cfg)
data = selectItems(
ctx,
math.min(cfg.classification.itemCount, sett.itemCount).toLong,
sett.category.getOrElse("")
)
_ <- OptionT.liftF(
analyser
.classifier(blocker)
.trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
)
} yield ())
.getOrElseF(logInactiveWarning(ctx.logger))
}
private def handleModel[F[_]: Sync: ContextShift](
ctx: Context[F, Args],
blocker: Blocker
)(trainedModel: ClassifierModel): F[Unit] =
for {
oldFile <- ctx.store.transact(
RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
)
_ <- ctx.logger.info("Storing new trained model")
fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
newFile <-
ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
_ <- ctx.store.transact(
RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
)
_ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
_ <- oldFile match {
case Some(fid) =>
ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
ctx.store.bitpeace.delete(fid.id).compile.drain
case None => ().pure[F]
}
} yield ()
private def selectItems[F[_]](
ctx: Context[F, Args],
max: Long,
category: String
): Stream[F, Data] = {
val connStream =
for {
item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
tt <- Stream.eval(
QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
)
} yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
ctx.store.transact(connStream.filter(_.text.nonEmpty))
}
private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
if (max <= 0) identity
else _.take(max)
private def findActiveSettings[F[_]: Sync](
ctx: Context[F, Args],
cfg: Config.TextAnalysis
): OptionT[F, OCollective.Classifier] =
if (cfg.classification.enabled)
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
.filter(_.enabled)
.filter(_.category.nonEmpty)
.map(OCollective.Classifier.fromRecord)
else
OptionT.none
private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
logger.warn(
"Classification is disabled. Check joex config and the collective settings."
)
}

View File

@ -38,6 +38,9 @@ case class ItemData(
copy(metas = next)
}
def appendTags(tags: Seq[String]): ItemData =
copy(tags = (this.tags ++ tags.toList).distinct)
def changeMeta(
attachId: Ident,
f: RAttachmentMeta => RAttachmentMeta

View File

@ -34,12 +34,12 @@ object ProcessItem {
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
def analysisOnly[F[_]: Sync](
def analysisOnly[F[_]: Sync: ContextShift](
cfg: Config,
analyser: TextAnalyser[F],
regexNer: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](analyser, regexNer)(item)
TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
.flatMap(FindProposal[F](cfg.processing))
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])

View File

@ -1,23 +1,33 @@
package docspell.joex.process
import cats.data.OptionT
import cats.effect._
import cats.implicits._
import docspell.analysis.TextAnalyser
import docspell.analysis.nlp.StanfordSettings
import docspell.analysis.nlp.ClassifierModel
import docspell.analysis.nlp.StanfordNerSettings
import docspell.analysis.nlp.TextClassifier
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
import docspell.joex.learn.LearnClassifierTask
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
import docspell.store.records.RClassifierSetting
import bitpeace.RangeDef
object TextAnalysis {
type Args = ProcessItemArgs
def apply[F[_]: Sync](
def apply[F[_]: Sync: ContextShift](
cfg: Config.TextAnalysis,
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
)(item: ItemData): Task[F, Args, ItemData] =
Task { ctx =>
for {
_ <- ctx.logger.info("Starting text analysis")
@ -34,15 +44,18 @@ object TextAnalysis {
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
} yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
} yield item
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
.appendTags(tag.toSeq)
}
def annotateAttachment[F[_]: Sync](
ctx: Context[F, ProcessItemArgs],
ctx: Context[F, Args],
analyser: TextAnalyser[F],
nerFile: RegexNerFile[F]
)(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
val settings = StanfordSettings(ctx.args.meta.language, false, None)
val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
for {
customNer <- nerFile.makeFile(ctx.args.meta.collective)
sett = settings.copy(regexNer = customNer)
@ -54,4 +67,42 @@ object TextAnalysis {
)
} yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
}
def predictTag[F[_]: Sync: ContextShift](
ctx: Context[F, Args],
cfg: Config.TextAnalysis,
metas: Vector[RAttachmentMeta],
classifier: TextClassifier[F]
): OptionT[F, String] =
for {
model <- findActiveModel(ctx, cfg)
_ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
modelData =
ctx.store.bitpeace
.get(model.id)
.unNoneTerminate
.through(ctx.store.bitpeace.fetchData2(RangeDef.all))
cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
val modelFile = dir.resolve("model.ser.gz")
modelData
.through(fs2.io.file.writeAll(modelFile, ctx.blocker))
.compile
.drain
.flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
}).filter(_ != LearnClassifierTask.noClass)
_ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
} yield cls
private def findActiveModel[F[_]: Sync](
ctx: Context[F, Args],
cfg: Config.TextAnalysis
): OptionT[F, Ident] =
if (cfg.classification.enabled)
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
.filter(_.enabled)
.mapFilter(_.fileId)
else
OptionT.none
}

View File

@ -1047,6 +1047,28 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ContactList"
/sec/collective/classifier/startonce:
post:
tags: [ Collective ]
summary: Starts the learn-classifier task
description: |
If the collective has classification enabled, this will submit
the task for learning a classifier from existing data. This
task is usually run periodically, as determined by the
collective settings.
The request body is empty; the collective's settings are used.
security:
- authTokenHeader: []
responses:
200:
description: Ok
content:
application/json:
schema:
$ref: "#/components/schemas/BasicResult"
/sec/user:
get:
tags: [ Collective ]
@ -3643,12 +3665,14 @@ components:
description: DateTime
type: integer
format: date-time
CollectiveSettings:
description: |
Settings for a collective.
required:
- language
- integrationEnabled
- classifier
properties:
language:
type: string
@ -3658,6 +3682,31 @@ components:
description: |
Whether the collective has the integration endpoint
enabled.
classifier:
$ref: "#/components/schemas/ClassifierSetting"
ClassifierSetting:
description: |
Settings for learning a document classifier.
required:
- enabled
- schedule
- itemCount
properties:
enabled:
type: boolean
category:
type: string
itemCount:
type: integer
format: int32
description: |
The max. number of items to learn from. The newest items
are considered.
schedule:
type: string
format: calevent
SourceList:
description: |
A list of sources.

View File

@ -10,6 +10,7 @@ import docspell.restapi.model._
import docspell.restserver.conv.Conversions
import docspell.restserver.http4s._
import com.github.eikek.calev.CalEvent
import org.http4s.HttpRoutes
import org.http4s.circe.CirceEntityDecoder._
import org.http4s.circe.CirceEntityEncoder._
@ -37,7 +38,18 @@ object CollectiveRoutes {
case req @ POST -> Root / "settings" =>
for {
settings <- req.as[CollectiveSettings]
sett = OCollective.Settings(settings.language, settings.integrationEnabled)
sett = OCollective.Settings(
settings.language,
settings.integrationEnabled,
Some(
OCollective.Classifier(
settings.classifier.enabled,
settings.classifier.schedule,
settings.classifier.itemCount,
settings.classifier.category
)
)
)
res <-
backend.collective
.updateSettings(user.account.collective, sett)
@ -46,8 +58,21 @@ object CollectiveRoutes {
case GET -> Root / "settings" =>
for {
collDb <- backend.collective.find(user.account.collective)
sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
settDb <- backend.collective.findSettings(user.account.collective)
sett = settDb.map(c =>
CollectiveSettings(
c.language,
c.integrationEnabled,
ClassifierSetting(
c.classifier.map(_.enabled).getOrElse(false),
c.classifier.flatMap(_.category),
c.classifier.map(_.itemCount).getOrElse(0),
c.classifier
.map(_.schedule)
.getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
)
)
)
resp <- sett.toResponse()
} yield resp
@ -63,6 +88,12 @@ object CollectiveRoutes {
resp <- Ok(ContactList(res.map(Conversions.mkContact)))
} yield resp
case POST -> Root / "classifier" / "startonce" =>
for {
_ <- backend.collective.startLearnClassifier(user.account.collective)
resp <- Ok(BasicResult(true, "Task submitted"))
} yield resp
case GET -> Root =>
for {
collDb <- backend.collective.find(user.account.collective)

View File

@ -0,0 +1,10 @@
CREATE TABLE `classifier_setting` (
`cid` varchar(254) not null primary key,
`enabled` boolean not null,
`schedule` varchar(254) not null,
`category` varchar(254) not null,
`item_count` int not null,
`file_id` varchar(254),
`created` timestamp not null,
foreign key (`cid`) references `collective`(`cid`)
);

View File

@ -0,0 +1,11 @@
CREATE TABLE "classifier_setting" (
"cid" varchar(254) not null primary key,
"enabled" boolean not null,
"schedule" varchar(254) not null,
"category" varchar(254) not null,
"item_count" int not null,
"file_id" varchar(254),
"created" timestamp not null,
foreign key ("cid") references "collective"("cid"),
foreign key ("file_id") references "filemeta"("id")
);

View File

@ -67,8 +67,8 @@ trait DoobieSyntax {
Fragment.const(" FROM ") ++ table ++ this.where(where)
def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
Fragment.const(") FROM ") ++ table ++ this.where(where)
Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
Fragment.const(" FROM ") ++ table ++ this.where(where)
def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this

View File

@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
import cats.implicits._
import fs2.Stream
import docspell.common.syntax.all._
import docspell.common.{IdRef, _}
import docspell.store.Store
import docspell.store.impl.Implicits._
@ -615,4 +616,75 @@ object QItem {
.query[NameAndNotes]
.streamWithChunkSize(chunkSize)
}
def findAllNewesFirst(
collective: Ident,
chunkSize: Int
): Stream[ConnectionIO, Ident] = {
val cols = Seq(RItem.Columns.id)
(selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
orderBy(RItem.Columns.created.desc))
.query[Ident]
.streamWithChunkSize(chunkSize)
}
case class TagName(id: Ident, name: String)
case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
def resolveTextAndTag(
collective: Ident,
itemId: Ident,
tagCategory: String,
pageSep: String
): ConnectionIO[TextAndTag] = {
val aId = RAttachment.Columns.id.prefix("a")
val aItem = RAttachment.Columns.itemId.prefix("a")
val mId = RAttachmentMeta.Columns.id.prefix("m")
val mText = RAttachmentMeta.Columns.content.prefix("m")
val tiItem = RTagItem.Columns.itemId.prefix("ti")
val tiTag = RTagItem.Columns.tagId.prefix("ti")
val tId = RTag.Columns.tid.prefix("t")
val tName = RTag.Columns.name.prefix("t")
val tCat = RTag.Columns.category.prefix("t")
val iId = RItem.Columns.id.prefix("i")
val iColl = RItem.Columns.cid.prefix("i")
val cte = withCTE(
"tags" -> selectSimple(
Seq(tiItem, tId, tName),
RTagItem.table ++ fr"ti INNER JOIN" ++
RTag.table ++ fr"t ON" ++ tId.is(tiTag),
and(tiItem.is(itemId), tCat.is(tagCategory))
)
)
val cols = Seq(mText, tId, tName)
val from = RItem.table ++ fr"i INNER JOIN" ++
RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)
val where =
and(
iId.is(itemId),
iColl.is(collective),
mText.isNotNull,
mText.isNot("")
)
val q = cte ++ selectDistinct(cols, from, where)
for {
_ <- logger.ftrace[ConnectionIO](
s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
)
texts <- q.query[(String, Option[TagName])].to[List]
_ <- logger.ftrace[ConnectionIO](
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
)
tag = texts.headOption.flatMap(_._2)
txt = texts.map(_._1).mkString(pageSep)
} yield TextAndTag(itemId, txt, tag)
}
}

View File

@ -0,0 +1,113 @@
package docspell.store.records
import cats.implicits._
import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._
import com.github.eikek.calev._
import doobie._
import doobie.implicits._
case class RClassifierSetting(
cid: Ident,
enabled: Boolean,
schedule: CalEvent,
category: String,
itemCount: Int,
fileId: Option[Ident],
created: Timestamp
) {}
object RClassifierSetting {
val table = fr"classifier_setting"
object Columns {
val cid = Column("cid")
val enabled = Column("enabled")
val schedule = Column("schedule")
val category = Column("category")
val itemCount = Column("item_count")
val fileId = Column("file_id")
val created = Column("created")
val all = List(cid, enabled, schedule, category, itemCount, fileId, created)
}
import Columns._
def insert(v: RClassifierSetting): ConnectionIO[Int] = {
val sql =
insertRow(
table,
all,
fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
)
sql.update.run
}
def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
val sql = updateRow(
table,
cid.is(v.cid),
commas(
enabled.setTo(v.enabled),
schedule.setTo(v.schedule),
category.setTo(v.category),
itemCount.setTo(v.itemCount),
fileId.setTo(v.fileId)
)
)
sql.update.run
}
def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
for {
n1 <- updateRow(
table,
cid.is(v.cid),
commas(
enabled.setTo(v.enabled),
schedule.setTo(v.schedule),
itemCount.setTo(v.itemCount),
category.setTo(v.category)
)
).update.run
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
} yield n1 + n2
def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
val sql = selectSimple(all, table, cid.is(id))
sql.query[RClassifierSetting].option
}
def delete(coll: Ident): ConnectionIO[Int] =
deleteFrom(table, cid.is(coll)).update.run
case class Classifier(
enabled: Boolean,
schedule: CalEvent,
itemCount: Int,
category: Option[String]
) {
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
RClassifierSetting(
coll,
enabled,
schedule,
category.getOrElse(""),
itemCount,
None,
created
)
}
object Classifier {
def fromRecord(r: RClassifierSetting): Classifier =
Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
}
}
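
One subtlety: category is a non-null column, so Classifier.toRecord stores a missing category as the empty string and fromRecord wraps it back into Some. An unset category therefore round-trips to Some(""), never None; a sketch (collective id and timestamp are stand-ins):

def categoryRoundTrip(coll: Ident, now: Timestamp): Option[String] = {
  val cls = RClassifierSetting.Classifier(true, CalEvent.unsafe("*-1/3-01 01:00:00"), 100, None)
  // toRecord stores category.getOrElse(""); fromRecord applies .some
  RClassifierSetting.Classifier.fromRecord(cls.toRecord(coll, now)).category
  // yields Some(""), not None
}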

View File

@ -61,14 +61,47 @@ object RCollective {
updateRow(table, id.is(cid), language.setTo(lang)).update.run
def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
updateRow(
table,
id.is(cid),
commas(
language.setTo(settings.language),
integration.setTo(settings.integrationEnabled)
)
).update.run
for {
n1 <- updateRow(
table,
id.is(cid),
commas(
language.setTo(settings.language),
integration.setTo(settings.integrationEnabled)
)
).update.run
cls <-
Timestamp
.current[ConnectionIO]
.map(now => settings.classifier.map(_.toRecord(cid, now)))
n2 <- cls match {
case Some(cr) =>
RClassifierSetting.updateSettings(cr)
case None =>
RClassifierSetting.delete(cid)
}
} yield n1 + n2
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
val cId = id.prefix("c")
val CS = RClassifierSetting.Columns
val csCid = CS.cid.prefix("cs")
val cols = Seq(
language.prefix("c"),
integration.prefix("c"),
CS.enabled.prefix("cs"),
CS.schedule.prefix("cs"),
CS.itemCount.prefix("cs"),
CS.category.prefix("cs")
)
val from = table ++ fr"c LEFT JOIN" ++
RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)
selectSimple(cols, from, cId.is(coll))
.query[Settings]
.option
}
def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
val sql = selectSimple(all, table, id.is(cid))
@ -112,5 +145,10 @@ object RCollective {
selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
}
case class Settings(language: Language, integrationEnabled: Boolean)
case class Settings(
language: Language,
integrationEnabled: Boolean,
classifier: Option[RClassifierSetting.Classifier]
)
}

View File

@ -88,6 +88,7 @@ module Api exposing
, setItemNotes
, setTags
, setUnconfirmed
, startClassifier
, startOnceNotifyDueItems
, startOnceScanMailbox
, startReIndex
@ -795,6 +796,19 @@ versionInfo flags receive =
--- Collective
startClassifier :
Flags
-> (Result Http.Error BasicResult -> msg)
-> Cmd msg
startClassifier flags receive =
Http2.authPost
{ url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce"
, account = getAccount flags
, body = Http.emptyBody
, expect = Http.expectJson receive Api.Model.BasicResult.decoder
}
getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg
getTagCloud flags receive =
Http2.authGet

View File

@ -218,12 +218,12 @@ loginInfo model =
, menuEntry model
CollectiveSettingPage
[ i [ class "users circle icon" ] []
, text "Collective Settings"
, text "Collective Profile"
]
, menuEntry model
UserSettingPage
[ i [ class "user circle icon" ] []
, text "User Settings"
, text "User Profile"
]
, div [ class "divider" ] []
, menuEntry model

View File

@ -0,0 +1,204 @@
module Comp.ClassifierSettingsForm exposing
( Model
, Msg
, getSettings
, init
, update
, view
)
import Api
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
import Api.Model.TagList exposing (TagList)
import Comp.CalEventInput
import Comp.FixedDropdown
import Comp.IntField
import Data.CalEvent exposing (CalEvent)
import Data.Flags exposing (Flags)
import Data.Validated exposing (Validated(..))
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck)
import Http
import Util.Tag
type alias Model =
{ enabled : Bool
, categoryModel : Comp.FixedDropdown.Model String
, category : Maybe String
, scheduleModel : Comp.CalEventInput.Model
, schedule : Validated CalEvent
, itemCountModel : Comp.IntField.Model
, itemCount : Maybe Int
}
type Msg
= GetTagsResp (Result Http.Error TagList)
| ScheduleMsg Comp.CalEventInput.Msg
| ToggleEnabled
| CategoryMsg (Comp.FixedDropdown.Msg String)
| ItemCountMsg Comp.IntField.Msg
init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
init flags sett =
let
newSchedule =
Data.CalEvent.fromEvent sett.schedule
|> Maybe.withDefault Data.CalEvent.everyMonth
( cem, cec ) =
Comp.CalEventInput.init flags newSchedule
in
( { enabled = sett.enabled
, categoryModel = Comp.FixedDropdown.initString []
, category = sett.category
, scheduleModel = cem
, schedule = Data.Validated.Unknown newSchedule
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
, itemCount = Just sett.itemCount
}
, Cmd.batch
[ Api.getTags flags "" GetTagsResp
, Cmd.map ScheduleMsg cec
]
)
getSettings : Model -> Validated ClassifierSetting
getSettings model =
Data.Validated.map
(\sch ->
{ enabled = model.enabled
, category = model.category
, schedule =
Data.CalEvent.makeEvent sch
, itemCount = Maybe.withDefault 0 model.itemCount
}
)
model.schedule
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
update flags msg model =
case msg of
GetTagsResp (Ok tl) ->
let
categories =
Util.Tag.getCategories tl.items
|> List.sort
in
( { model
| categoryModel = Comp.FixedDropdown.initString categories
, category =
if model.category == Nothing then
List.head categories
else
model.category
}
, Cmd.none
)
GetTagsResp (Err _) ->
( model, Cmd.none )
ScheduleMsg lmsg ->
let
( cm, cc, ce ) =
Comp.CalEventInput.update
flags
(Data.Validated.value model.schedule)
lmsg
model.scheduleModel
in
( { model
| scheduleModel = cm
, schedule = ce
}
, Cmd.map ScheduleMsg cc
)
ToggleEnabled ->
( { model | enabled = not model.enabled }
, Cmd.none
)
CategoryMsg lmsg ->
let
( mm, ma ) =
Comp.FixedDropdown.update lmsg model.categoryModel
in
( { model
| categoryModel = mm
, category =
if ma == Nothing then
model.category
else
ma
}
, Cmd.none
)
ItemCountMsg lmsg ->
let
( im, iv ) =
Comp.IntField.update lmsg model.itemCountModel
in
( { model
| itemCountModel = im
, itemCount = iv
}
, Cmd.none
)
view : Model -> Html Msg
view model =
div []
[ div
[ class "field"
]
[ div [ class "ui checkbox" ]
[ input
[ type_ "checkbox"
, onCheck (\_ -> ToggleEnabled)
, checked model.enabled
]
[]
, label [] [ text "Enable classification" ]
, span [ class "small-info" ]
[ text "Disable document classification if not needed."
]
]
]
, div [ class "ui basic segment" ]
[ text "Document classification tries to predict a tag for new incoming documents. This "
, text "works by learning from existing documents in order to find common patterns within "
, text "the text. The more documents you have correctly tagged, the better. Learning is done "
, text "periodically based on a schedule and you need to specify a tag-group that should "
, text "be used for learning."
]
, div [ class "field" ]
[ label [] [ text "Category" ]
, Html.map CategoryMsg
(Comp.FixedDropdown.viewString model.category
model.categoryModel
)
]
, Html.map ItemCountMsg
(Comp.IntField.viewWithInfo
"The maximum number of items to learn from, order by date newest first. Use 0 to mean all."
model.itemCount
"field"
model.itemCountModel
)
, div [ class "field" ]
[ label [] [ text "Schedule" ]
, Html.map ScheduleMsg
(Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
]
]

View File

@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
import Api
import Api.Model.BasicResult exposing (BasicResult)
import Api.Model.CollectiveSettings exposing (CollectiveSettings)
import Comp.ClassifierSettingsForm
import Comp.Dropdown
import Data.Flags exposing (Flags)
import Data.Language exposing (Language)
import Data.UiSettings exposing (UiSettings)
import Data.Validated exposing (Validated)
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck, onClick, onInput)
@ -27,44 +29,60 @@ type alias Model =
, initSettings : CollectiveSettings
, fullTextConfirmText : String
, fullTextReIndexResult : Maybe BasicResult
, classifierModel : Comp.ClassifierSettingsForm.Model
, startClassifierResult : Maybe BasicResult
}
init : CollectiveSettings -> Model
init settings =
init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
init flags settings =
let
lang =
Data.Language.fromString settings.language
|> Maybe.withDefault Data.Language.German
( cm, cc ) =
Comp.ClassifierSettingsForm.init flags settings.classifier
in
{ langModel =
Comp.Dropdown.makeSingleList
{ makeOption =
\l ->
{ value = Data.Language.toIso3 l
, text = Data.Language.toName l
, additional = ""
}
, placeholder = ""
, options = Data.Language.all
, selected = Just lang
}
, intEnabled = settings.integrationEnabled
, initSettings = settings
, fullTextConfirmText = ""
, fullTextReIndexResult = Nothing
}
( { langModel =
Comp.Dropdown.makeSingleList
{ makeOption =
\l ->
{ value = Data.Language.toIso3 l
, text = Data.Language.toName l
, additional = ""
}
, placeholder = ""
, options = Data.Language.all
, selected = Just lang
}
, intEnabled = settings.integrationEnabled
, initSettings = settings
, fullTextConfirmText = ""
, fullTextReIndexResult = Nothing
, classifierModel = cm
, startClassifierResult = Nothing
}
, Cmd.map ClassifierSettingMsg cc
)
getSettings : Model -> CollectiveSettings
getSettings : Model -> Validated CollectiveSettings
getSettings model =
CollectiveSettings
(Comp.Dropdown.getSelected model.langModel
|> List.head
|> Maybe.map Data.Language.toIso3
|> Maybe.withDefault model.initSettings.language
Data.Validated.map
(\cls ->
{ language =
Comp.Dropdown.getSelected model.langModel
|> List.head
|> Maybe.map Data.Language.toIso3
|> Maybe.withDefault model.initSettings.language
, integrationEnabled = model.intEnabled
, classifier = cls
}
)
(Comp.ClassifierSettingsForm.getSettings
model.classifierModel
)
model.intEnabled
type Msg
@ -73,6 +91,10 @@ type Msg
| SetFullTextConfirm String
| TriggerReIndex
| TriggerReIndexResult (Result Http.Error BasicResult)
| ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
| SaveSettings
| StartClassifierTask
| StartClassifierResp (Result Http.Error BasicResult)
update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@ -85,22 +107,15 @@ update flags msg model =
nextModel =
{ model | langModel = m2 }
nextSettings =
if Comp.Dropdown.isDropdownChangeMsg m then
Just (getSettings nextModel)
else
Nothing
in
( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
( nextModel, Cmd.map LangDropdownMsg c2, Nothing )
ToggleIntegrationEndpoint ->
let
nextModel =
{ model | intEnabled = not model.intEnabled }
in
( nextModel, Cmd.none, Just (getSettings nextModel) )
( nextModel, Cmd.none, Nothing )
SetFullTextConfirm str ->
( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@ -138,12 +153,50 @@ update flags msg model =
, Nothing
)
ClassifierSettingMsg lmsg ->
let
( cm, cc ) =
Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
in
( { model
| classifierModel = cm
}
, Cmd.map ClassifierSettingMsg cc
, Nothing
)
SaveSettings ->
case getSettings model of
Data.Validated.Valid s ->
( model, Cmd.none, Just s )
_ ->
( model, Cmd.none, Nothing )
StartClassifierTask ->
( model, Api.startClassifier flags StartClassifierResp, Nothing )
StartClassifierResp (Ok br) ->
( { model | startClassifierResult = Just br }
, Cmd.none
, Nothing
)
StartClassifierResp (Err err) ->
( { model
| startClassifierResult =
Just (BasicResult False (Util.Http.errorToString err))
}
, Cmd.none
, Nothing
)
view : Flags -> UiSettings -> Model -> Html Msg
view flags settings model =
div
[ classList
[ ( "ui form", True )
[ ( "ui form error success", True )
, ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
, ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
]
@ -219,17 +272,62 @@ view flags settings model =
[ text "This starts a task that clears the full-text index and re-indexes all your data again."
, text "You must type OK before clicking the button to avoid accidental re-indexing."
]
, div
[ classList
[ ( "ui message", True )
, ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
, ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
, ( "hidden invisible", model.fullTextReIndexResult == Nothing )
]
]
[ Maybe.map .message model.fullTextReIndexResult
|> Maybe.withDefault ""
|> text
, renderResultMessage model.fullTextReIndexResult
]
, h3
[ classList
[ ( "ui dividing header", True )
, ( "invisible hidden", False )
]
]
[ text "Document Classifier"
]
, div
[ classList
[ ( "field", True )
, ( "invisible hidden", False )
]
]
[ Html.map ClassifierSettingMsg
(Comp.ClassifierSettingsForm.view model.classifierModel)
, div [ class "ui vertical segment" ]
[ button
[ classList
[ ( "ui small secondary basic button", True )
, ( "disabled", not model.classifierModel.enabled )
]
, title "Starts a task to train a classifier"
, onClick StartClassifierTask
]
[ text "Start now"
]
, renderResultMessage model.startClassifierResult
]
]
, div [ class "ui divider" ] []
, button
[ classList
[ ( "ui primary button", True )
, ( "disabled", getSettings model |> Data.Validated.isInvalid )
]
, onClick SaveSettings
]
[ text "Save"
]
]
renderResultMessage : Maybe BasicResult -> Html msg
renderResultMessage result =
div
[ classList
[ ( "ui message", True )
, ( "error", Maybe.map .success result == Just False )
, ( "success", Maybe.map .success result == Just True )
, ( "hidden invisible", result == Nothing )
]
]
[ Maybe.map .message result
|> Maybe.withDefault ""
|> text
]

View File

@ -1,5 +1,6 @@
module Data.Validated exposing
( Validated(..)
, isInvalid
, map
, map2
, map3
@ -14,6 +15,19 @@ type Validated a
| Unknown a
isInvalid : Validated a -> Bool
isInvalid v =
case v of
Valid _ ->
False
Invalid _ _ ->
True
Unknown _ ->
False
value : Validated a -> a
value va =
case va of

View File

@ -30,15 +30,21 @@ init flags =
let
( sm, sc ) =
Comp.SourceManage.init flags
( cm, cc ) =
Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
in
( { currentTab = Just InsightsTab
, sourceModel = sm
, userModel = Comp.UserManage.emptyModel
, settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
, settingsModel = cm
, insights = Api.Model.ItemInsights.empty
, submitResult = Nothing
}
, Cmd.map SourceMsg sc
, Cmd.batch
[ Cmd.map SourceMsg sc
, Cmd.map SettingsFormMsg cc
]
)

View File

@ -77,7 +77,13 @@ update flags msg model =
( model, Cmd.none )
CollectiveSettingsResp (Ok data) ->
( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
let
( cm, cc ) =
Comp.CollectiveSettingsForm.init flags data
in
( { model | settingsModel = cm }
, Cmd.map SettingsFormMsg cc
)
CollectiveSettingsResp (Err _) ->
( model, Cmd.none )

View File

@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
viewSettings flags settings model =
[ h2 [ class "ui header" ]
[ i [ class "cog icon" ] []
, text "Settings"
, text "Collective Settings"
]
, div [ class "ui segment" ]
[ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
[ Html.map SettingsFormMsg
(Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
]
, div
[ classList

View File

@ -95,6 +95,21 @@ let
enabled = true;
file-cache-time = "1 minute";
};
classification = {
enabled = true;
item-count = 0;
classifiers = [
{ "useSplitWords" = "true";
"splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
"splitWordsIgnoreRegexp" = ''\s+'';
"useSplitPrefixSuffixNGrams" = "true";
"maxNGramLeng" = "4";
"minNGramLeng" = "1";
"splitWordShape" = "chris4";
"intern" = "true";
}
];
};
working-dir = "/tmp/docspell-analysis";
};
processing = {
@ -736,6 +751,59 @@ in {
default = defaults.text-analysis.regex-ner;
description = "";
};
classification = mkOption {
type = types.submodule({
options = {
enabled = mkOption {
type = types.bool;
default = defaults.text-analysis.classification.enabled;
description = ''
Whether to enable classification globally. Each collective can
decide to disable it. If it is disabled here, no collective
can use classification.
'';
};
item-count = mkOption {
type = types.int;
default = defaults.text-analysis.classification.item-count;
description = ''
If concerned with memory consumption, this restricts the
number of items to consider. More items are better for training. A
negative value or zero means to train on all items.
'';
};
classifiers = mkOption {
type = types.listOf types.attrs;
default = defaults.text-analysis.classification.classifiers;
description = ''
These settings are used to configure the classifier. If
multiple are given, they are all tried and the "best" is
chosen at the end. See
https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
for more info about these settings. The settings here yielded
good results with *my* dataset.
'';
};
};
});
default = defaults.text-analysis.classification;
description = ''
Settings for doing document classification.
This works by learning from existing documents. A collective can
specify a tag category and the system will try to predict a tag
from this category for new incoming documents.
This requires a statistical model that is computed from all
existing documents. This process is run periodically as
configured by the collective. It may require a lot of memory,
depending on the amount of data.
It utilises this NLP library: https://nlp.stanford.edu/.
'';
};
};
});
default = defaults.text-analysis;

View File

@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
, { image = "img/analyze-feature.png"
, header = "Text Analysis"
, description = """
The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically.
"""
}
, { image = "img/filetype-feature.svg"

View File

@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
"assignment" semantics. Docspell doesn't propose any workflow, but it
can help to implement some.
The tags are *not* taken into account when creating suggestions from
analyzed text yet. However, PDF files may contain metadata itself and
if there is a metadata *keywords* list, these keywords are matched
against the tags in the database. If they match, the item is tagged
automatically.
Docspell can try to predict a tag for new incoming documents
automatically, based on your existing data. This requires training an
algorithm. There are some caveats: the more data you have correctly
tagged, the better the results, so it won't work well for, say, the
first 100 documents. The tags must also relate to a pattern in the
document text. Tags like *todo* or *waiting* obviously won't work, but
the typical "document type" tags, like *invoice* and *receipt*, are a
good fit! That is why you need to provide a tag category, so only
sensible tags are learned. The algorithm goes through all your items
and learns patterns in the text that relate to the given tags. This
training step can be run periodically, as specified in your collective
settings, so that docspell keeps learning from your already tagged
data! More information about the algorithm can be found in the config,
where it is possible to fine-tune this process.
Another way to have items tagged automatically is when an input PDF
file contains a list of keywords in its metadata section. These
keywords are then matched against the tags in the database; if they
match, the item is tagged with them.
## Organization and Person