commit 8cb78e3dbe
@@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
 import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.StanfordTextClassifier
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._

 trait TextAnalyser[F[_]] {

   def annotate(
       logger: Logger[F],
-      settings: StanfordSettings,
+      settings: StanfordNerSettings,
       cacheKey: Ident,
       text: String
   ): F[TextAnalyser.Result]

+  def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F]
 }
 object TextAnalyser {

@@ -35,7 +38,7 @@ object TextAnalyser {
     new TextAnalyser[F] {
       def annotate(
          logger: Logger[F],
-          settings: StanfordSettings,
+          settings: StanfordNerSettings,
          cacheKey: Ident,
          text: String
      ): F[TextAnalyser.Result] =
@@ -48,6 +51,11 @@ object TextAnalyser {
          spans = NerLabelSpan.build(list)
        } yield Result(spans ++ list, dates)

+      def classifier(blocker: Blocker)(implicit
+          CS: ContextShift[F]
+      ): TextClassifier[F] =
+        new StanfordTextClassifier[F](cfg.classifier, blocker)
+
      private def textLimit(logger: Logger[F], text: String): F[String] =
        if (text.length <= cfg.maxLength) text.pure[F]
        else
@@ -56,7 +64,7 @@ object TextAnalyser {
            s" Analysing only first ${cfg.maxLength} characters."
          ) *> text.take(cfg.maxLength).pure[F]

-      private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+      private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
          : F[Vector[NerLabel]] =
        StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)

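With this change one `TextAnalyser` value serves both NER annotation and text classification. A minimal usage sketch, assuming hypothetical values `analyser: TextAnalyser[IO]`, `logger: Logger[IO]` and a `Blocker` in scope (`Ident.randomId`, `Language.English` and `Result.all` are taken from the surrounding code; the model path is illustrative):

    import cats.effect.{Blocker, ContextShift, IO}
    import docspell.analysis.TextAnalyser
    import docspell.analysis.nlp.{ClassifierModel, StanfordNerSettings}
    import docspell.common._

    def sketch(analyser: TextAnalyser[IO], logger: Logger[IO], blocker: Blocker)(implicit
        CS: ContextShift[IO]
    ): IO[Option[String]] =
      for {
        key <- Ident.randomId[IO]
        result <- analyser.annotate(
          logger,
          StanfordNerSettings(Language.English, false, None),
          key,
          "Invoice 2020-06-01 from ACME"
        )
        _ <- IO(println(result.all)) // NER labels found in the text
        // classify against a previously trained model file
        cat <- analyser
          .classifier(blocker)
          .classify(logger, ClassifierModel(java.nio.file.Paths.get("model.ser.gz")), "some text")
      } yield cat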
@@ -1,5 +1,8 @@
 package docspell.analysis

+import docspell.analysis.nlp.TextClassifierConfig
+
 case class TextAnalysisConfig(
-  maxLength: Int
+  maxLength: Int,
+  classifier: TextClassifierConfig
 )
@@ -0,0 +1,5 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+case class ClassifierModel(model: Path)
@@ -19,7 +19,7 @@ import org.log4s.getLogger
   */
 trait PipelineCache[F[_]] {

-  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+  def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP]

 }

@@ -28,7 +28,7 @@ object PipelineCache {

   def none[F[_]: Applicative]: PipelineCache[F] =
     new PipelineCache[F] {
-      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
        makeClassifier(settings).pure[F]
    }

@@ -38,7 +38,7 @@ object PipelineCache {
   final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
      extends PipelineCache[F] {

-    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+    def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
      for {
        id  <- makeSettingsId(settings)
        nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
@@ -48,7 +48,7 @@ object PipelineCache {
        key: String,
        id: String,
        cache: Map[String, Entry],
-        settings: StanfordSettings
+        settings: StanfordNerSettings
    ): (Map[String, Entry], StanfordCoreNLP) =
      cache.get(key) match {
        case Some(entry) =>
@@ -68,7 +68,7 @@ object PipelineCache {
          (cache.updated(key, e), nlp)
      }

-    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+    private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
      val base = settings.copy(regexNer = None).toString
      val size: F[Long] =
        settings.regexNer match {
@@ -81,7 +81,7 @@ object PipelineCache {
        }

  }
-  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+  private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
    logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
    new StanfordCoreNLP(Properties.forSettings(settings))
  }
@@ -7,6 +7,9 @@ import docspell.common._

 object Properties {

+  def fromMap(m: Map[String, String]): JProps =
+    apply(m.toSeq: _*)
+
   def apply(ps: (String, String)*): JProps = {
     val p = new JProps()
     for ((k, v) <- ps)
@@ -14,7 +17,7 @@ object Properties {
     p
   }

-  def forSettings(settings: StanfordSettings): JProps = {
+  def forSettings(settings: StanfordNerSettings): JProps = {
     val regexNerFile = settings.regexNer
       .map(p => p.normalize().toAbsolutePath().toString())
     settings.lang match {
@@ -25,7 +25,7 @@ object StanfordNerClassifier {
   def nerAnnotate[F[_]: Applicative](
       cacheKey: String,
       cache: PipelineCache[F]
-  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
       .map(crf => runClassifier(crf, text))
@@ -19,4 +19,8 @@ import docspell.common._
   * as a last step to tag untagged tokens using the provided list of
   * regexps.
   */
-case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
+case class StanfordNerSettings(
+    lang: Language,
+    highRecall: Boolean,
+    regexNer: Option[Path]
+)
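Constructing the renamed settings type stays a one-liner; a hedged example (values illustrative):

    import docspell.analysis.nlp.StanfordNerSettings
    import docspell.common.Language

    // German NER without high recall and without an extra regex-NER file
    val nerSettings = StanfordNerSettings(Language.German, highRecall = false, regexNer = None)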
@@ -0,0 +1,153 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier._
+import docspell.common._
+
+import edu.stanford.nlp.classify.ColumnDataClassifier
+
+final class StanfordTextClassifier[F[_]: Sync: ContextShift](
+    cfg: TextClassifierConfig,
+    blocker: Blocker
+) extends TextClassifier[F] {
+
+  def trainClassifier[A](
+      logger: Logger[F],
+      data: Stream[F, Data]
+  )(handler: TextClassifier.Handler[F, A]): F[A] =
+    File
+      .withTempDir(cfg.workingDir, "trainclassifier")
+      .use { dir =>
+        for {
+          rawData   <- writeDataFile(blocker, dir, data)
+          _         <- logger.info(s"Learning from ${rawData.count} items.")
+          trainData <- splitData(logger, rawData)
+          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
+          sorted = scores.sortBy(-_.score)
+          res <- handler(sorted.head.model)
+        } yield res
+      }
+
+  def classify(
+      logger: Logger[F],
+      model: ClassifierModel,
+      text: String
+  ): F[Option[String]] =
+    Sync[F].delay {
+      val cls = ColumnDataClassifier.getClassifier(
+        model.model.normalize().toAbsolutePath().toString()
+      )
+      val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
+      Option(cat)
+    }
+
+  // --- helpers
+
+  def train(
+      logger: Logger[F],
+      in: TrainData,
+      props: Map[String, String]
+  ): F[TrainResult] =
+    for {
+      _ <- logger.debug(s"Training classifier from $props")
+      res <- Sync[F].delay {
+        val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
+        cdc.trainClassifier(in.train.toString())
+        val score = cdc.testClassifier(in.test.toString())
+        TrainResult(score.first(), ClassifierModel(in.modelFile))
+      }
+      _ <- logger.debug(s"Trained with result $res")
+    } yield res
+
+  def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
+    val nTest = (in.count * 0.15).toLong
+
+    val td =
+      TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
+
+    val fileLines =
+      fs2.io.file
+        .readAll(in.file, blocker, 4096)
+        .through(fs2.text.utf8Decode)
+        .through(fs2.text.lines)
+
+    for {
+      _ <- logger.debug(
+        s"Splitting raw data into test/train data. Testing with $nTest entries"
+      )
+      _ <-
+        fileLines
+          .take(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.test, blocker))
+          .compile
+          .drain
+      _ <-
+        fileLines
+          .drop(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.train, blocker))
+          .compile
+          .drain
+    } yield td
+  }
+
+  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
+    val target = dir.resolve("rawdata")
+    for {
+      counter <- Ref.of[F, Long](0L)
+      _ <-
+        data
+          .filter(_.text.nonEmpty)
+          .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}")
+          .evalTap(_ => counter.update(_ + 1))
+          .intersperse("\r\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(target, blocker))
+          .compile
+          .drain
+      lines <- counter.get
+    } yield RawData(lines, target)
+  }
+
+  def normalisedText(text: String): String =
+    text.replaceAll("[\n\r\t]+", " ")
+
+  def fixRef(str: String): String =
+    str.replace('\t', '_')
+
+  def amendProps(
+      trainData: TrainData,
+      props: Map[String, String]
+  ): Map[String, String] =
+    prepend("2.", props) ++ Map(
+      "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
+      "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
+      "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
+    ).toList
+
+  case class RawData(count: Long, file: Path)
+  case class TrainData(train: Path, test: Path) {
+    val modelFile = train.resolveSibling("model.ser.gz")
+  }
+
+  case class TrainResult(score: Double, model: ClassifierModel)
+
+  def prepend(pre: String, data: Map[String, String]): Map[String, String] =
+    data.toList
+      .map({
+        case (k, v) =>
+          if (k.startsWith(pre)) (k, v)
+          else (pre + k, v)
+      })
+      .toMap
+}
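For orientation: `writeDataFile` above emits one item per line in the three-column, tab-separated layout that `ColumnDataClassifier` expects (class, reference, text), and `amendProps` prefixes feature keys with `2.` so they apply to the third, text-carrying column. A small sketch mirroring that line format (example values are illustrative):

    import docspell.analysis.nlp.TextClassifier.Data

    // same shape as the lines produced by writeDataFile
    def toLine(d: Data): String =
      s"${d.cls}\t${d.ref.replace('\t', '_')}\t${d.text.replaceAll("[\n\r\t]+", " ")}"

    // toLine(Data("invoice", "item-1", "this is your invoice total $421"))
    //   == "invoice\titem-1\tthis is your invoice total $421"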
@@ -0,0 +1,25 @@
+package docspell.analysis.nlp
+
+import cats.data.Kleisli
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.common._
+
+trait TextClassifier[F[_]] {
+
+  def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])(
+      handler: TextClassifier.Handler[F, A]
+  ): F[A]
+
+  def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]]
+
+}
+
+object TextClassifier {
+
+  type Handler[F[_], A] = Kleisli[F, ClassifierModel, A]
+
+  case class Data(cls: String, ref: String, text: String)
+
+}
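The `Handler` alias makes the caller responsible for persisting the trained model: the `Kleisli` runs while the temporary training directory still exists, and the model file is gone once `trainClassifier` returns. A minimal sketch of such a handler, used as `classifier.trainClassifier(logger, data)(persist)` (the target path and copy strategy are assumptions, not from this commit):

    import java.nio.file.{Files, Paths, StandardCopyOption}

    import cats.data.Kleisli
    import cats.effect.IO
    import cats.implicits._

    import docspell.analysis.nlp.TextClassifier

    // copy the model somewhere durable before the temp dir is removed
    val persist: TextClassifier.Handler[IO, Unit] =
      Kleisli { model =>
        IO(
          Files.copy(
            model.model,
            Paths.get("saved-model.ser.gz"),
            StandardCopyOption.REPLACE_EXISTING
          )
        ).void
      }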
@@ -0,0 +1,10 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.data.NonEmptyList
+
+case class TextClassifierConfig(
+    workingDir: Path,
+    classifierConfigs: NonEmptyList[Map[String, String]]
+)
BIN  modules/analysis/src/test/resources/test.ser.gz (new binary file, not shown)
@@ -0,0 +1,76 @@
+package docspell.analysis.nlp
+
+import minitest._
+import cats.effect._
+import scala.concurrent.ExecutionContext
+import java.nio.file.Paths
+import cats.data.NonEmptyList
+import docspell.common._
+import fs2.Stream
+import cats.data.Kleisli
+import TextClassifier.Data
+
+object StanfordTextClassifierSuite extends SimpleTestSuite {
+  val logger = Logger.log4s[IO](org.log4s.getLogger)
+
+  implicit val CS = IO.contextShift(ExecutionContext.global)
+
+  test("learn from data") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+
+    val data =
+      Stream
+        .emit(Data("invoice", "n", "this is your invoice total $421"))
+        .repeat
+        .take(10)
+        .zip(
+          Stream
+            .emit(Data("receipt", "n", "shopping receipt cheese cake bar"))
+            .repeat
+            .take(10)
+        )
+        .flatMap({
+          case (a, b) =>
+            Stream.emits(Seq(a, b))
+        })
+        .covary[IO]
+
+    val modelExists =
+      Blocker[IO].use { blocker =>
+        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+        classifier.trainClassifier[Boolean](logger, data)(
+          Kleisli(result => File.existsNonEmpty[IO](result.model))
+        )
+      }
+    assertEquals(modelExists.unsafeRunSync(), true)
+  }
+
+  test("run classifier") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+    val things = for {
+      dir     <- File.withTempDir[IO](Paths.get("target"), "testcls")
+      blocker <- Blocker[IO]
+    } yield (dir, blocker)
+
+    things
+      .use {
+        case (dir, blocker) =>
+          val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+
+          val modelFile = dir.resolve("test.ser.gz")
+          for {
+            _ <-
+              LenientUri
+                .fromJava(getClass.getResource("/test.ser.gz"))
+                .readURL[IO](4096, blocker)
+                .through(fs2.io.file.writeAll(modelFile, blocker))
+                .compile
+                .drain
+            model = ClassifierModel(modelFile)
+            cat <- classifier.classify(logger, model, "there is receipt always")
+            _ = assertEquals(cat, Some("receipt"))
+          } yield ()
+      }
+      .unsafeRunSync()
+  }
+}
@@ -52,12 +52,12 @@ object BackendApp {
       queue      <- JobQueue(store)
       loginImpl  <- Login[F](store)
       signupImpl <- OSignup[F](store)
-      collImpl   <- OCollective[F](store)
+      joexImpl   <- OJoex(JoexClient(httpClient), store)
+      collImpl   <- OCollective[F](store, utStore, queue, joexImpl)
       sourceImpl <- OSource[F](store)
       tagImpl    <- OTag[F](store)
       equipImpl  <- OEquipment[F](store)
       orgImpl    <- OOrganization(store)
-      joexImpl   <- OJoex(JoexClient(httpClient), store)
       uploadImpl <- OUpload(store, queue, cfg.files, joexImpl)
       nodeImpl   <- ONode(store)
       jobImpl    <- OJob(store, joexImpl)
@@ -8,14 +8,21 @@ import docspell.backend.PasswordCrypt
 import docspell.backend.ops.OCollective._
 import docspell.common._
 import docspell.store.queries.QCollective
+import docspell.store.queue.JobQueue
 import docspell.store.records._
+import docspell.store.usertask.UserTask
+import docspell.store.usertask.UserTaskStore
 import docspell.store.{AddResult, Store}

+import com.github.eikek.calev._
+
 trait OCollective[F[_]] {

   def find(name: Ident): F[Option[RCollective]]

-  def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
+  def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]

+  def findSettings(collective: Ident): F[Option[OCollective.Settings]]
+
   def listUser(collective: Ident): F[Vector[RUser]]

@@ -43,6 +50,7 @@ trait OCollective[F[_]] {

   def findEnabledSource(sourceId: Ident): F[Option[RSource]]

+  def startLearnClassifier(collective: Ident): F[Unit]
 }

 object OCollective {
@@ -55,6 +63,8 @@ object OCollective {

   type Settings = RCollective.Settings
   val Settings = RCollective.Settings
+  type Classifier = RClassifierSetting.Classifier
+  val Classifier = RClassifierSetting.Classifier

   sealed trait PassChangeResult
   object PassChangeResult {
@@ -91,7 +101,12 @@ object OCollective {
     }
   }

-  def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] =
+  def apply[F[_]: Effect](
+      store: Store[F],
+      uts: UserTaskStore[F],
+      queue: JobQueue[F],
+      joex: OJoex[F]
+  ): Resource[F, OCollective[F]] =
     Resource.pure[F, OCollective[F]](new OCollective[F] {
       def find(name: Ident): F[Option[RCollective]] =
        store.transact(RCollective.findById(name))
@@ -101,6 +116,41 @@ object OCollective {
          .transact(RCollective.updateSettings(collective, sett))
          .attempt
          .map(AddResult.fromUpdate)
+          .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F])
+
+      def updateLearnClassifierTask(coll: Ident, sett: Settings) =
+        for {
+          id <- Ident.randomId[F]
+          on    = sett.classifier.map(_.enabled).getOrElse(false)
+          timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe(""))
+          ut = UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            on,
+            timer,
+            LearnClassifierArgs(coll)
+          )
+          _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut)
+          _ <- joex.notifyAllNodes
+        } yield ()
+
+      def startLearnClassifier(collective: Ident): F[Unit] =
+        for {
+          id <- Ident.randomId[F]
+          ut <- UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            true,
+            CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All),
+            LearnClassifierArgs(collective)
+          ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName))
+          job <- ut.toJob
+          _   <- queue.insert(job)
+          _   <- joex.notifyAllNodes
+        } yield ()
+
+      def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
+        store.transact(RCollective.getSettings(collective))
+
       def listUser(collective: Ident): F[Vector[RUser]] =
        store.transact(RUser.findAll(collective, _.login))
@@ -0,0 +1,35 @@
+package docspell.common
+
+import docspell.common.syntax.all._
+
+import io.circe._
+import io.circe.generic.semiauto._
+
+/** Arguments to the classify-item task.
+  *
+  * This task is run periodically and learns from existing documents
+  * to create a model for predicting tags of new documents. The user
+  * must give a tag category as a subset of possible tags.
+  */
+case class LearnClassifierArgs(
+    collective: Ident
+) {
+
+  def makeSubject: String =
+    "Learn tags"
+
+}
+
+object LearnClassifierArgs {
+
+  val taskName = Ident.unsafe("learn-classifier")
+
+  implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
+    deriveEncoder[LearnClassifierArgs]
+  implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
+    deriveDecoder[LearnClassifierArgs]
+
+  def parse(str: String): Either[Throwable, LearnClassifierArgs] =
+    str.parseJsonAs[LearnClassifierArgs]
+
+}
@@ -271,6 +271,50 @@ docspell.joex {
     # file will be kept until a check for a state change is done.
     file-cache-time = "1 minute"
   }
+
+  # Settings for doing document classification.
+  #
+  # This works by learning from existing documents. A collective can
+  # specify a tag category and the system will try to predict a tag
+  # from this category for new incoming documents.
+  #
+  # This requires a statistical model that is computed from all
+  # existing documents. This process is run periodically as
+  # configured by the collective. It may require a lot of memory,
+  # depending on the amount of data.
+  #
+  # It utilises this NLP library: https://nlp.stanford.edu/.
+  classification {
+    # Whether to enable classification globally. Each collective can
+    # decide to disable it. If it is disabled here, no collective
+    # can use classification.
+    enabled = true
+
+    # If concerned with memory consumption, this restricts the
+    # number of items to consider. More are better for training. A
+    # negative value or zero means to train on all items.
+    item-count = 0
+
+    # These settings are used to configure the classifier. If
+    # multiple are given, they are all tried and the "best" is
+    # chosen at the end. See
+    # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+    # for more info about these settings. The settings here yielded
+    # good results with *my* dataset.
+    #
+    # Enclose regexps in triple quotes.
+    classifiers = [
+      { "useSplitWords" = "true"
+        "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
+        "splitWordsIgnoreRegexp" = """\s+"""
+        "useSplitPrefixSuffixNGrams" = "true"
+        "maxNGramLeng" = "4"
+        "minNGramLeng" = "1"
+        "splitWordShape" = "chris4"
+        "intern" = "true" # makes it slower but saves memory
+      }
+    ]
+  }
 }

 # Configuration for converting files into PDFs.
@@ -2,7 +2,10 @@ package docspell.joex

 import java.nio.file.Path

+import cats.data.NonEmptyList
+
 import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.nlp.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -57,15 +60,30 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      regexNer: RegexNer
+      regexNer: RegexNer,
+      classification: Classification
   ) {

     def textAnalysisConfig: TextAnalysisConfig =
-      TextAnalysisConfig(maxLength)
+      TextAnalysisConfig(
+        maxLength,
+        TextClassifierConfig(
+          workingDir,
+          NonEmptyList
+            .fromList(classification.classifiers)
+            .getOrElse(NonEmptyList.of(Map.empty))
+        )
+      )

     def regexNerFileConfig: RegexNerFile.Config =
       RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
   }

   case class RegexNer(enabled: Boolean, fileCacheTime: Duration)

+  case class Classification(
+      enabled: Boolean,
+      itemCount: Int,
+      classifiers: List[Map[String, String]]
+  )
 }
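Note the fallback when mapping the config: an empty `classifiers` list still yields a usable `TextClassifierConfig`, because a single empty property map is substituted. A quick illustration:

    import cats.data.NonEmptyList

    val fromConfig: List[Map[String, String]] = Nil // nothing configured
    val candidates =
      NonEmptyList.fromList(fromConfig).getOrElse(NonEmptyList.of(Map.empty[String, String]))
    // candidates == NonEmptyList.of(Map()): training still runs, with default
    // ColumnDataClassifier settings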
@@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.notify._
 import docspell.joex.pdfconv.ConvertAllPdfTask
 import docspell.joex.pdfconv.PdfConvTask
@@ -159,6 +160,13 @@ object JoexAppImpl {
             ConvertAllPdfTask.onCancel[F]
           )
         )
+        .withTask(
+          JobTask.json(
+            LearnClassifierArgs.taskName,
+            LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
+            LearnClassifierTask.onCancel[F]
+          )
+        )
         .resource
       psch <- PeriodicScheduler.create(
         cfg.periodicScheduler,
@@ -0,0 +1,111 @@
+package docspell.joex.learn
+
+import cats.data.Kleisli
+import cats.data.OptionT
+import cats.effect._
+import cats.implicits._
+import fs2.{Pipe, Stream}
+
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.backend.ops.OCollective
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler._
+import docspell.store.queries.QItem
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.MimetypeHint
+
+object LearnClassifierTask {
+  val noClass = "__NONE__"
+  val pageSep = " --n-- "
+
+  type Args = LearnClassifierArgs
+
+  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+    Task.log(_.warn("Cancelling learn-classifier task"))
+
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
+      blocker: Blocker,
+      analyser: TextAnalyser[F]
+  ): Task[F, Args, Unit] =
+    Task { ctx =>
+      (for {
+        sett <- findActiveSettings[F](ctx, cfg)
+        data = selectItems(
+          ctx,
+          math.min(cfg.classification.itemCount, sett.itemCount).toLong,
+          sett.category.getOrElse("")
+        )
+        _ <- OptionT.liftF(
+          analyser
+            .classifier(blocker)
+            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
+        )
+      } yield ())
+        .getOrElseF(logInactiveWarning(ctx.logger))
+    }
+
+  private def handleModel[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      blocker: Blocker
+  )(trainedModel: ClassifierModel): F[Unit] =
+    for {
+      oldFile <- ctx.store.transact(
+        RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
+      )
+      _ <- ctx.logger.info("Storing new trained model")
+      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
+      newFile <-
+        ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
+      _ <- ctx.store.transact(
+        RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
+      )
+      _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
+      _ <- oldFile match {
+        case Some(fid) =>
+          ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
+            ctx.store.bitpeace.delete(fid.id).compile.drain
+        case None => ().pure[F]
+      }
+    } yield ()
+
+  private def selectItems[F[_]](
+      ctx: Context[F, Args],
+      max: Long,
+      category: String
+  ): Stream[F, Data] = {
+    val connStream =
+      for {
+        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
+        tt <- Stream.eval(
+          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
+        )
+      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
+    ctx.store.transact(connStream.filter(_.text.nonEmpty))
+  }
+
+  private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
+    if (max <= 0) identity
+    else _.take(max)
+
+  private def findActiveSettings[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, OCollective.Classifier] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
+        .filter(_.enabled)
+        .filter(_.category.nonEmpty)
+        .map(OCollective.Classifier.fromRecord)
+    else
+      OptionT.none
+
+  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
+    logger.warn(
+      "Classification is disabled. Check joex config and the collective settings."
+    )
+}
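Worth noting how `selectItems` combines the global and per-collective limits; a sketch with illustrative numbers:

    // joex config: classification.item-count = 500; collective setting: itemCount = 200
    val effective = math.min(500, 200).toLong // 200
    // if either value is zero or negative, the minimum is also <= 0 and
    // restrictTo becomes a no-op, i.e. training uses all items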
@@ -38,6 +38,9 @@ case class ItemData(
     copy(metas = next)
   }

+  def appendTags(tags: Seq[String]): ItemData =
+    copy(tags = (this.tags ++ tags.toList).distinct)
+
   def changeMeta(
       attachId: Ident,
       f: RAttachmentMeta => RAttachmentMeta
@@ -34,12 +34,12 @@ object ProcessItem {
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)

-  def analysisOnly[F[_]: Sync](
+  def analysisOnly[F[_]: Sync: ContextShift](
       cfg: Config,
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser, regexNer)(item)
+    TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
@@ -1,23 +1,33 @@
 package docspell.joex.process

+import cats.data.OptionT
 import cats.effect._
 import cats.implicits._

 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.RangeDef

 object TextAnalysis {
+  type Args = ProcessItemArgs

-  def apply[F[_]: Sync](
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
-  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  )(item: ItemData): Task[F, Args, ItemData] =
     Task { ctx =>
       for {
         _ <- ctx.logger.info("Starting text analysis")
@@ -34,15 +44,18 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
+      } yield item
+        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .appendTags(tag.toSeq)
     }

   def annotateAttachment[F[_]: Sync](
-      ctx: Context[F, ProcessItemArgs],
+      ctx: Context[F, Args],
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(ctx.args.meta.language, false, None)
+    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
     for {
       customNer <- nerFile.makeFile(ctx.args.meta.collective)
       sett = settings.copy(regexNer = customNer)
@@ -54,4 +67,42 @@ object TextAnalysis {
       )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  def predictTag[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): OptionT[F, String] =
+    for {
+      model <- findActiveModel(ctx, cfg)
+      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
+      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
+      modelData =
+        ctx.store.bitpeace
+          .get(model.id)
+          .unNoneTerminate
+          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
+        val modelFile = dir.resolve("model.ser.gz")
+        modelData
+          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
+          .compile
+          .drain
+          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
+      }).filter(_ != LearnClassifierTask.noClass)
+      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
+    } yield cls
+
+  private def findActiveModel[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, Ident] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+        .filter(_.enabled)
+        .mapFilter(_.fileId)
+    else
+      OptionT.none
+
 }
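Because `predictTag` is an `OptionT`, every missing precondition (classification disabled, no stored model, or a `__NONE__` prediction) collapses to `None`, and `appendTags(tag.toSeq)` then appends nothing. A tiny illustration of that last step:

    val tag: Option[String] = None
    tag.toSeq // == Seq(): appendTags leaves item.tags unchanged in this case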
@@ -1047,6 +1047,28 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ContactList"
+  /sec/collective/classifier/startonce:
+    post:
+      tags: [ Collective ]
+      summary: Starts the learn-classifier task
+      description: |
+        If the collective has classification enabled, this will submit
+        the task for learning a classifier from existing data. This
+        task is usually run periodically as determined by the
+        collective settings.
+
+        The request is empty, settings are used from the collective.
+      security:
+        - authTokenHeader: []
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"
+
   /sec/user:
     get:
       tags: [ Collective ]
@@ -3643,12 +3665,14 @@ components:
         description: DateTime
         type: integer
        format: date-time

    CollectiveSettings:
      description: |
        Settings for a collective.
      required:
        - language
        - integrationEnabled
+        - classifier
      properties:
        language:
          type: string
@@ -3658,6 +3682,31 @@ components:
          description: |
            Whether the collective has the integration endpoint
            enabled.
+        classifier:
+          $ref: "#/components/schemas/ClassifierSetting"
+
+    ClassifierSetting:
+      description: |
+        Settings for learning a document classifier.
+      required:
+        - enabled
+        - schedule
+        - itemCount
+      properties:
+        enabled:
+          type: boolean
+        category:
+          type: string
+        itemCount:
+          type: integer
+          format: int32
+          description: |
+            The max. number of items to learn from. The newest items
+            are considered.
+        schedule:
+          type: string
+          format: calevent
+
    SourceList:
      description: |
        A list of sources.
@@ -10,6 +10,7 @@ import docspell.restapi.model._
 import docspell.restserver.conv.Conversions
 import docspell.restserver.http4s._

+import com.github.eikek.calev.CalEvent
 import org.http4s.HttpRoutes
 import org.http4s.circe.CirceEntityDecoder._
 import org.http4s.circe.CirceEntityEncoder._
@@ -37,7 +38,18 @@ object CollectiveRoutes {
       case req @ POST -> Root / "settings" =>
         for {
           settings <- req.as[CollectiveSettings]
-          sett = OCollective.Settings(settings.language, settings.integrationEnabled)
+          sett = OCollective.Settings(
+            settings.language,
+            settings.integrationEnabled,
+            Some(
+              OCollective.Classifier(
+                settings.classifier.enabled,
+                settings.classifier.schedule,
+                settings.classifier.itemCount,
+                settings.classifier.category
+              )
+            )
+          )
           res <-
             backend.collective
               .updateSettings(user.account.collective, sett)
@@ -46,8 +58,21 @@ object CollectiveRoutes {

       case GET -> Root / "settings" =>
         for {
-          collDb <- backend.collective.find(user.account.collective)
-          sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
+          settDb <- backend.collective.findSettings(user.account.collective)
+          sett = settDb.map(c =>
+            CollectiveSettings(
+              c.language,
+              c.integrationEnabled,
+              ClassifierSetting(
+                c.classifier.map(_.enabled).getOrElse(false),
+                c.classifier.flatMap(_.category),
+                c.classifier.map(_.itemCount).getOrElse(0),
+                c.classifier
+                  .map(_.schedule)
+                  .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
+              )
+            )
+          )
           resp <- sett.toResponse()
         } yield resp

@@ -63,6 +88,12 @@ object CollectiveRoutes {
           resp <- Ok(ContactList(res.map(Conversions.mkContact)))
         } yield resp

+      case POST -> Root / "classifier" / "startonce" =>
+        for {
+          _    <- backend.collective.startLearnClassifier(user.account.collective)
+          resp <- Ok(BasicResult(true, "Task submitted"))
+        } yield resp
+
       case GET -> Root =>
         for {
           collDb <- backend.collective.find(user.account.collective)
@@ -0,0 +1,9 @@
+CREATE TABLE `classifier_setting` (
+  `cid` varchar(254) not null primary key,
+  `enabled` boolean not null,
+  `schedule` varchar(254) not null,
+  `category` varchar(254) not null,
+  `file_id` varchar(254),
+  `created` timestamp not null,
+  foreign key (`cid`) references `collective`(`cid`)
+);
@@ -0,0 +1,11 @@
+CREATE TABLE "classifier_setting" (
+  "cid" varchar(254) not null primary key,
+  "enabled" boolean not null,
+  "schedule" varchar(254) not null,
+  "category" varchar(254) not null,
+  "item_count" int not null,
+  "file_id" varchar(254),
+  "created" timestamp not null,
+  foreign key ("cid") references "collective"("cid"),
+  foreign key ("file_id") references "filemeta"("id")
+);
@@ -67,8 +67,8 @@ trait DoobieSyntax {
     Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
-    Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
-      Fragment.const(") FROM ") ++ table ++ this.where(where)
+    Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
+      Fragment.const(" FROM ") ++ table ++ this.where(where)

   def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
     Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this
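The reasoning behind this two-character change: wrapping the column list in parentheses made `DISTINCT(...)` look like a function applied to a single expression, which is invalid or surprising once several columns are selected, as `resolveTextAndTag` below now does. A sketch of the generated SQL before and after (column names illustrative):

    // before: SELECT DISTINCT(m.content,t.tid,t.name) FROM ...
    //   parenthesized form; not valid standard SQL for multiple columns
    // after:  SELECT DISTINCT m.content,t.tid,t.name FROM ...
    //   standard SQL; DISTINCT applies to the whole selected row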
@@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream

+import docspell.common.syntax.all._
 import docspell.common.{IdRef, _}
 import docspell.store.Store
 import docspell.store.impl.Implicits._
@@ -615,4 +616,75 @@ object QItem {
       .query[NameAndNotes]
       .streamWithChunkSize(chunkSize)
   }

+  def findAllNewesFirst(
+      collective: Ident,
+      chunkSize: Int
+  ): Stream[ConnectionIO, Ident] = {
+    val cols = Seq(RItem.Columns.id)
+    (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
+      orderBy(RItem.Columns.created.desc))
+      .query[Ident]
+      .streamWithChunkSize(chunkSize)
+  }
+
+  case class TagName(id: Ident, name: String)
+  case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
+
+  def resolveTextAndTag(
+      collective: Ident,
+      itemId: Ident,
+      tagCategory: String,
+      pageSep: String
+  ): ConnectionIO[TextAndTag] = {
+    val aId    = RAttachment.Columns.id.prefix("a")
+    val aItem  = RAttachment.Columns.itemId.prefix("a")
+    val mId    = RAttachmentMeta.Columns.id.prefix("m")
+    val mText  = RAttachmentMeta.Columns.content.prefix("m")
+    val tiItem = RTagItem.Columns.itemId.prefix("ti")
+    val tiTag  = RTagItem.Columns.tagId.prefix("ti")
+    val tId    = RTag.Columns.tid.prefix("t")
+    val tName  = RTag.Columns.name.prefix("t")
+    val tCat   = RTag.Columns.category.prefix("t")
+    val iId    = RItem.Columns.id.prefix("i")
+    val iColl  = RItem.Columns.cid.prefix("i")
+
+    val cte = withCTE(
+      "tags" -> selectSimple(
+        Seq(tiItem, tId, tName),
+        RTagItem.table ++ fr"ti INNER JOIN" ++
+          RTag.table ++ fr"t ON" ++ tId.is(tiTag),
+        and(tiItem.is(itemId), tCat.is(tagCategory))
+      )
+    )
+
+    val cols = Seq(mText, tId, tName)
+
+    val from = RItem.table ++ fr"i INNER JOIN" ++
+      RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
+      RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
+      fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)
+
+    val where =
+      and(
+        iId.is(itemId),
+        iColl.is(collective),
+        mText.isNotNull,
+        mText.isNot("")
+      )
+
+    val q = cte ++ selectDistinct(cols, from, where)
+    for {
+      _ <- logger.ftrace[ConnectionIO](
+        s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
+      )
+      texts <- q.query[(String, Option[TagName])].to[List]
+      _ <- logger.ftrace[ConnectionIO](
+        s"Got ${texts.size} text and tag entries for item ${itemId.id}"
+      )
+      tag = texts.headOption.flatMap(_._2)
+      txt = texts.map(_._1).mkString(pageSep)
+    } yield TextAndTag(itemId, txt, tag)
+  }
+
 }
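These two queries are meant to be composed, as `LearnClassifierTask.selectItems` does; a hedged sketch of that wiring (category and separator values illustrative):

    import doobie.ConnectionIO
    import fs2.Stream

    import docspell.common.Ident

    def trainingRows(collective: Ident): Stream[ConnectionIO, QItem.TextAndTag] =
      QItem
        .findAllNewesFirst(collective, 10)
        .evalMap(id => QItem.resolveTextAndTag(collective, id, "doctype", " --n-- "))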
@@ -0,0 +1,113 @@
+package docspell.store.records
+
+import cats.implicits._
+
+import docspell.common._
+import docspell.store.impl.Implicits._
+import docspell.store.impl._
+
+import com.github.eikek.calev._
+import doobie._
+import doobie.implicits._
+
+case class RClassifierSetting(
+    cid: Ident,
+    enabled: Boolean,
+    schedule: CalEvent,
+    category: String,
+    itemCount: Int,
+    fileId: Option[Ident],
+    created: Timestamp
+) {}
+
+object RClassifierSetting {
+
+  val table = fr"classifier_setting"
+
+  object Columns {
+    val cid       = Column("cid")
+    val enabled   = Column("enabled")
+    val schedule  = Column("schedule")
+    val category  = Column("category")
+    val itemCount = Column("item_count")
+    val fileId    = Column("file_id")
+    val created   = Column("created")
+    val all       = List(cid, enabled, schedule, category, itemCount, fileId, created)
+  }
+  import Columns._
+
+  def insert(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql =
+      insertRow(
+        table,
+        all,
+        fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
+      )
+    sql.update.run
+  }
+
+  def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql = updateRow(
+      table,
+      cid.is(v.cid),
+      commas(
+        enabled.setTo(v.enabled),
+        schedule.setTo(v.schedule),
+        category.setTo(v.category),
+        itemCount.setTo(v.itemCount),
+        fileId.setTo(v.fileId)
+      )
+    )
+    sql.update.run
+  }
+
+  def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
+    updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run
+
+  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
+    for {
+      n1 <- updateRow(
+        table,
+        cid.is(v.cid),
+        commas(
+          enabled.setTo(v.enabled),
+          schedule.setTo(v.schedule),
+          itemCount.setTo(v.itemCount),
+          category.setTo(v.category)
+        )
+      ).update.run
+      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
+    } yield n1 + n2
+
+  def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
+    val sql = selectSimple(all, table, cid.is(id))
+    sql.query[RClassifierSetting].option
+  }
+
+  def delete(coll: Ident): ConnectionIO[Int] =
+    deleteFrom(table, cid.is(coll)).update.run
+
+  case class Classifier(
+      enabled: Boolean,
+      schedule: CalEvent,
+      itemCount: Int,
+      category: Option[String]
+  ) {
+
+    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
+      RClassifierSetting(
+        coll,
+        enabled,
+        schedule,
+        category.getOrElse(""),
+        itemCount,
+        None,
+        created
+      )
+  }
+  object Classifier {
+    def fromRecord(r: RClassifierSetting): Classifier =
+      Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
+  }
+
+}
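`updateSettings` is a plain update-then-insert upsert; run inside a single transaction, as `RCollective.updateSettings` below does, this is adequate for a one-row-per-collective table. The effective statement sequence, sketched as comments:

    // UPDATE classifier_setting SET enabled=?, schedule=?, item_count=?, category=? WHERE cid = ?
    // if 0 rows were updated:
    //   INSERT INTO classifier_setting (cid, enabled, schedule, category, item_count, file_id, created)
    //   VALUES (...)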
@ -61,14 +61,47 @@ object RCollective {
     updateRow(table, id.is(cid), language.setTo(lang)).update.run

   def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
-    updateRow(
-      table,
-      id.is(cid),
-      commas(
-        language.setTo(settings.language),
-        integration.setTo(settings.integrationEnabled)
-      )
-    ).update.run
+    for {
+      n1 <- updateRow(
+        table,
+        id.is(cid),
+        commas(
+          language.setTo(settings.language),
+          integration.setTo(settings.integrationEnabled)
+        )
+      ).update.run
+      cls <-
+        Timestamp
+          .current[ConnectionIO]
+          .map(now => settings.classifier.map(_.toRecord(cid, now)))
+      n2 <- cls match {
+        case Some(cr) =>
+          RClassifierSetting.updateSettings(cr)
+        case None =>
+          RClassifierSetting.delete(cid)
+      }
+    } yield n1 + n2
+
+  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+    val cId   = id.prefix("c")
+    val CS    = RClassifierSetting.Columns
+    val csCid = CS.cid.prefix("cs")
+
+    val cols = Seq(
+      language.prefix("c"),
+      integration.prefix("c"),
+      CS.enabled.prefix("cs"),
+      CS.schedule.prefix("cs"),
+      CS.itemCount.prefix("cs"),
+      CS.category.prefix("cs")
+    )
+    val from = table ++ fr"c LEFT JOIN" ++
+      RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)
+
+    selectSimple(cols, from, cId.is(coll))
+      .query[Settings]
+      .option
+  }

   def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
     val sql = selectSimple(all, table, id.is(cid))
@ -112,5 +145,10 @@ object RCollective {
     selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
   }

-  case class Settings(language: Language, integrationEnabled: Boolean)
+  case class Settings(
+    language: Language,
+    integrationEnabled: Boolean,
+    classifier: Option[RClassifierSetting.Classifier]
+  )

 }
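To illustrate the extended Settings record, a hypothetical construction; the schedule string, item count, and category are made-up example values, and `CalEvent.unsafe` is assumed from the calev library docspell uses:

import com.github.eikek.calev.CalEvent

// Example values only: train on the latest 600 items, predicting tags
// from a (hypothetical) "doctype" category during the first week of a month.
val classifier = RClassifierSetting.Classifier(
  enabled = true,
  schedule = CalEvent.unsafe("*-*-01..07 04:00:00"),
  itemCount = 600,
  category = Some("doctype")
)

val settings = RCollective.Settings(
  language = Language.German,
  integrationEnabled = false,
  classifier = Some(classifier)
)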
@ -88,6 +88,7 @@ module Api exposing
     , setItemNotes
     , setTags
     , setUnconfirmed
+    , startClassifier
     , startOnceNotifyDueItems
     , startOnceScanMailbox
     , startReIndex
@ -795,6 +796,19 @@ versionInfo flags receive =
 --- Collective


+startClassifier :
+    Flags
+    -> (Result Http.Error BasicResult -> msg)
+    -> Cmd msg
+startClassifier flags receive =
+    Http2.authPost
+        { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce"
+        , account = getAccount flags
+        , body = Http.emptyBody
+        , expect = Http.expectJson receive Api.Model.BasicResult.decoder
+        }
+
+
 getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg
 getTagCloud flags receive =
     Http2.authGet
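The new call posts an empty body and expects a BasicResult. A hedged sketch of what the server side of this endpoint could look like in http4s (which docspell's restserver builds on); the route wiring and task submission are assumptions, not taken from this commit:

import cats.effect.IO
import org.http4s.HttpRoutes
import org.http4s.dsl.io._

// Hypothetical handler: submit the learn-classifier task, then confirm.
// The auth middleware is assumed to have stripped the /api/v1/sec prefix.
def classifierRoutes(submitLearnTask: IO[Unit]): HttpRoutes[IO] =
  HttpRoutes.of[IO] {
    case POST -> Root / "classifier" / "startonce" =>
      submitLearnTask *> Ok("""{"success":true,"message":"Task submitted"}""")
  }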
@ -218,12 +218,12 @@ loginInfo model =
             , menuEntry model
                 CollectiveSettingPage
                 [ i [ class "users circle icon" ] []
-                , text "Collective Settings"
+                , text "Collective Profile"
                 ]
             , menuEntry model
                 UserSettingPage
                 [ i [ class "user circle icon" ] []
-                , text "User Settings"
+                , text "User Profile"
                 ]
             , div [ class "divider" ] []
             , menuEntry model
modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm (new file, 204 lines)
@ -0,0 +1,204 @@
module Comp.ClassifierSettingsForm exposing
    ( Model
    , Msg
    , getSettings
    , init
    , update
    , view
    )

import Api
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
import Api.Model.TagList exposing (TagList)
import Comp.CalEventInput
import Comp.FixedDropdown
import Comp.IntField
import Data.CalEvent exposing (CalEvent)
import Data.Flags exposing (Flags)
import Data.Validated exposing (Validated(..))
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck)
import Http
import Util.Tag


type alias Model =
    { enabled : Bool
    , categoryModel : Comp.FixedDropdown.Model String
    , category : Maybe String
    , scheduleModel : Comp.CalEventInput.Model
    , schedule : Validated CalEvent
    , itemCountModel : Comp.IntField.Model
    , itemCount : Maybe Int
    }


type Msg
    = GetTagsResp (Result Http.Error TagList)
    | ScheduleMsg Comp.CalEventInput.Msg
    | ToggleEnabled
    | CategoryMsg (Comp.FixedDropdown.Msg String)
    | ItemCountMsg Comp.IntField.Msg


init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
init flags sett =
    let
        newSchedule =
            Data.CalEvent.fromEvent sett.schedule
                |> Maybe.withDefault Data.CalEvent.everyMonth

        ( cem, cec ) =
            Comp.CalEventInput.init flags newSchedule
    in
    ( { enabled = sett.enabled
      , categoryModel = Comp.FixedDropdown.initString []
      , category = sett.category
      , scheduleModel = cem
      , schedule = Data.Validated.Unknown newSchedule
      , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
      , itemCount = Just sett.itemCount
      }
    , Cmd.batch
        [ Api.getTags flags "" GetTagsResp
        , Cmd.map ScheduleMsg cec
        ]
    )


getSettings : Model -> Validated ClassifierSetting
getSettings model =
    Data.Validated.map
        (\sch ->
            { enabled = model.enabled
            , category = model.category
            , schedule =
                Data.CalEvent.makeEvent sch
            , itemCount = Maybe.withDefault 0 model.itemCount
            }
        )
        model.schedule


update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
update flags msg model =
    case msg of
        GetTagsResp (Ok tl) ->
            let
                categories =
                    Util.Tag.getCategories tl.items
                        |> List.sort
            in
            ( { model
                | categoryModel = Comp.FixedDropdown.initString categories
                , category =
                    if model.category == Nothing then
                        List.head categories

                    else
                        model.category
              }
            , Cmd.none
            )

        GetTagsResp (Err _) ->
            ( model, Cmd.none )

        ScheduleMsg lmsg ->
            let
                ( cm, cc, ce ) =
                    Comp.CalEventInput.update
                        flags
                        (Data.Validated.value model.schedule)
                        lmsg
                        model.scheduleModel
            in
            ( { model
                | scheduleModel = cm
                , schedule = ce
              }
            , Cmd.map ScheduleMsg cc
            )

        ToggleEnabled ->
            ( { model | enabled = not model.enabled }
            , Cmd.none
            )

        CategoryMsg lmsg ->
            let
                ( mm, ma ) =
                    Comp.FixedDropdown.update lmsg model.categoryModel
            in
            ( { model
                | categoryModel = mm
                , category =
                    if ma == Nothing then
                        model.category

                    else
                        ma
              }
            , Cmd.none
            )

        ItemCountMsg lmsg ->
            let
                ( im, iv ) =
                    Comp.IntField.update lmsg model.itemCountModel
            in
            ( { model
                | itemCountModel = im
                , itemCount = iv
              }
            , Cmd.none
            )


view : Model -> Html Msg
view model =
    div []
        [ div
            [ class "field"
            ]
            [ div [ class "ui checkbox" ]
                [ input
                    [ type_ "checkbox"
                    , onCheck (\_ -> ToggleEnabled)
                    , checked model.enabled
                    ]
                    []
                , label [] [ text "Enable classification" ]
                , span [ class "small-info" ]
                    [ text "Disable document classification if not needed."
                    ]
                ]
            ]
        , div [ class "ui basic segment" ]
            [ text "Document classification tries to predict a tag for new incoming documents. This "
            , text "works by learning from existing documents in order to find common patterns within "
            , text "the text. The more documents you have correctly tagged, the better. Learning is done "
            , text "periodically based on a schedule and you need to specify a tag-group that should "
            , text "be used for learning."
            ]
        , div [ class "field" ]
            [ label [] [ text "Category" ]
            , Html.map CategoryMsg
                (Comp.FixedDropdown.viewString model.category
                    model.categoryModel
                )
            ]
        , Html.map ItemCountMsg
            (Comp.IntField.viewWithInfo
                "The maximum number of items to learn from, ordered by date, newest first. Use 0 to mean all."
                model.itemCount
                "field"
                model.itemCountModel
            )
        , div [ class "field" ]
            [ label [] [ text "Schedule" ]
            , Html.map ScheduleMsg
                (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
            ]
        ]
@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
 import Api
 import Api.Model.BasicResult exposing (BasicResult)
 import Api.Model.CollectiveSettings exposing (CollectiveSettings)
+import Comp.ClassifierSettingsForm
 import Comp.Dropdown
 import Data.Flags exposing (Flags)
 import Data.Language exposing (Language)
 import Data.UiSettings exposing (UiSettings)
+import Data.Validated exposing (Validated)
 import Html exposing (..)
 import Html.Attributes exposing (..)
 import Html.Events exposing (onCheck, onClick, onInput)
@ -27,44 +29,60 @@ type alias Model =
     , initSettings : CollectiveSettings
     , fullTextConfirmText : String
     , fullTextReIndexResult : Maybe BasicResult
+    , classifierModel : Comp.ClassifierSettingsForm.Model
+    , startClassifierResult : Maybe BasicResult
     }


-init : CollectiveSettings -> Model
-init settings =
+init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
+init flags settings =
     let
         lang =
             Data.Language.fromString settings.language
                 |> Maybe.withDefault Data.Language.German
+
+        ( cm, cc ) =
+            Comp.ClassifierSettingsForm.init flags settings.classifier
     in
-    { langModel =
+    ( { langModel =
             Comp.Dropdown.makeSingleList
                 { makeOption =
                     \l ->
                         { value = Data.Language.toIso3 l
                         , text = Data.Language.toName l
                         , additional = ""
                         }
                 , placeholder = ""
                 , options = Data.Language.all
                 , selected = Just lang
                 }
-    , intEnabled = settings.integrationEnabled
-    , initSettings = settings
-    , fullTextConfirmText = ""
-    , fullTextReIndexResult = Nothing
-    }
+      , intEnabled = settings.integrationEnabled
+      , initSettings = settings
+      , fullTextConfirmText = ""
+      , fullTextReIndexResult = Nothing
+      , classifierModel = cm
+      , startClassifierResult = Nothing
+      }
+    , Cmd.map ClassifierSettingMsg cc
+    )


-getSettings : Model -> CollectiveSettings
+getSettings : Model -> Validated CollectiveSettings
 getSettings model =
-    CollectiveSettings
-        (Comp.Dropdown.getSelected model.langModel
-            |> List.head
-            |> Maybe.map Data.Language.toIso3
-            |> Maybe.withDefault model.initSettings.language
-        )
-        model.intEnabled
+    Data.Validated.map
+        (\cls ->
+            { language =
+                Comp.Dropdown.getSelected model.langModel
+                    |> List.head
+                    |> Maybe.map Data.Language.toIso3
+                    |> Maybe.withDefault model.initSettings.language
+            , integrationEnabled = model.intEnabled
+            , classifier = cls
+            }
+        )
+        (Comp.ClassifierSettingsForm.getSettings
+            model.classifierModel
+        )


 type Msg
@ -73,6 +91,10 @@ type Msg
     | SetFullTextConfirm String
     | TriggerReIndex
     | TriggerReIndexResult (Result Http.Error BasicResult)
+    | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
+    | SaveSettings
+    | StartClassifierTask
+    | StartClassifierResp (Result Http.Error BasicResult)


 update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@ -85,22 +107,15 @@ update flags msg model =
             nextModel =
                 { model | langModel = m2 }
-
-            nextSettings =
-                if Comp.Dropdown.isDropdownChangeMsg m then
-                    Just (getSettings nextModel)
-
-                else
-                    Nothing
         in
-        ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
+        ( nextModel, Cmd.map LangDropdownMsg c2, Nothing )

         ToggleIntegrationEndpoint ->
             let
                 nextModel =
                     { model | intEnabled = not model.intEnabled }
             in
-            ( nextModel, Cmd.none, Just (getSettings nextModel) )
+            ( nextModel, Cmd.none, Nothing )

         SetFullTextConfirm str ->
             ( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@ -138,12 +153,50 @@ update flags msg model =
                 , Nothing
                 )

+        ClassifierSettingMsg lmsg ->
+            let
+                ( cm, cc ) =
+                    Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
+            in
+            ( { model
+                | classifierModel = cm
+              }
+            , Cmd.map ClassifierSettingMsg cc
+            , Nothing
+            )
+
+        SaveSettings ->
+            case getSettings model of
+                Data.Validated.Valid s ->
+                    ( model, Cmd.none, Just s )
+
+                _ ->
+                    ( model, Cmd.none, Nothing )
+
+        StartClassifierTask ->
+            ( model, Api.startClassifier flags StartClassifierResp, Nothing )
+
+        StartClassifierResp (Ok br) ->
+            ( { model | startClassifierResult = Just br }
+            , Cmd.none
+            , Nothing
+            )
+
+        StartClassifierResp (Err err) ->
+            ( { model
+                | startClassifierResult =
+                    Just (BasicResult False (Util.Http.errorToString err))
+              }
+            , Cmd.none
+            , Nothing
+            )


 view : Flags -> UiSettings -> Model -> Html Msg
 view flags settings model =
     div
         [ classList
-            [ ( "ui form", True )
+            [ ( "ui form error success", True )
             , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
             , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
             ]
@ -219,17 +272,62 @@ view flags settings model =
             [ text "This starts a task that clears the full-text index and re-indexes all your data again."
             , text "You must type OK before clicking the button to avoid accidental re-indexing."
             ]
-        , div
-            [ classList
-                [ ( "ui message", True )
-                , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
-                , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
-                , ( "hidden invisible", model.fullTextReIndexResult == Nothing )
-                ]
-            ]
-            [ Maybe.map .message model.fullTextReIndexResult
-                |> Maybe.withDefault ""
-                |> text
-            ]
-        ]
+        , renderResultMessage model.fullTextReIndexResult
+        ]
+    , h3
+        [ classList
+            [ ( "ui dividing header", True )
+            , ( "invisible hidden", False )
+            ]
+        ]
+        [ text "Document Classifier"
+        ]
+    , div
+        [ classList
+            [ ( "field", True )
+            , ( "invisible hidden", False )
+            ]
+        ]
+        [ Html.map ClassifierSettingMsg
+            (Comp.ClassifierSettingsForm.view model.classifierModel)
+        , div [ class "ui vertical segment" ]
+            [ button
+                [ classList
+                    [ ( "ui small secondary basic button", True )
+                    , ( "disabled", not model.classifierModel.enabled )
+                    ]
+                , title "Starts a task to train a classifier"
+                , onClick StartClassifierTask
+                ]
+                [ text "Start now"
+                ]
+            , renderResultMessage model.startClassifierResult
+            ]
+        ]
+    , div [ class "ui divider" ] []
+    , button
+        [ classList
+            [ ( "ui primary button", True )
+            , ( "disabled", getSettings model |> Data.Validated.isInvalid )
+            ]
+        , onClick SaveSettings
+        ]
+        [ text "Save"
+        ]
+    ]
+
+
+renderResultMessage : Maybe BasicResult -> Html msg
+renderResultMessage result =
+    div
+        [ classList
+            [ ( "ui message", True )
+            , ( "error", Maybe.map .success result == Just False )
+            , ( "success", Maybe.map .success result == Just True )
+            , ( "hidden invisible", result == Nothing )
+            ]
+        ]
+        [ Maybe.map .message result
+            |> Maybe.withDefault ""
+            |> text
         ]
@ -1,5 +1,6 @@
 module Data.Validated exposing
     ( Validated(..)
+    , isInvalid
     , map
     , map2
     , map3
@ -14,6 +15,19 @@ type Validated a
     | Unknown a


+isInvalid : Validated a -> Bool
+isInvalid v =
+    case v of
+        Valid _ ->
+            False
+
+        Invalid _ _ ->
+            True
+
+        Unknown _ ->
+            False
+
+
 value : Validated a -> a
 value va =
     case va of
@ -30,15 +30,21 @@ init flags =
     let
         ( sm, sc ) =
             Comp.SourceManage.init flags
+
+        ( cm, cc ) =
+            Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
     in
     ( { currentTab = Just InsightsTab
       , sourceModel = sm
       , userModel = Comp.UserManage.emptyModel
-      , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
+      , settingsModel = cm
       , insights = Api.Model.ItemInsights.empty
       , submitResult = Nothing
       }
-    , Cmd.map SourceMsg sc
+    , Cmd.batch
+        [ Cmd.map SourceMsg sc
+        , Cmd.map SettingsFormMsg cc
+        ]
     )


@ -77,7 +77,13 @@ update flags msg model =
             ( model, Cmd.none )

         CollectiveSettingsResp (Ok data) ->
-            ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
+            let
+                ( cm, cc ) =
+                    Comp.CollectiveSettingsForm.init flags data
+            in
+            ( { model | settingsModel = cm }
+            , Cmd.map SettingsFormMsg cc
+            )

         CollectiveSettingsResp (Err _) ->
             ( model, Cmd.none )
@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
 viewSettings flags settings model =
     [ h2 [ class "ui header" ]
         [ i [ class "cog icon" ] []
-        , text "Settings"
+        , text "Collective Settings"
         ]
     , div [ class "ui segment" ]
-        [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
+        [ Html.map SettingsFormMsg
+            (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
         ]
     , div
         [ classList
@ -95,6 +95,21 @@ let
         enabled = true;
         file-cache-time = "1 minute";
       };
+      classification = {
+        enabled = true;
+        item-count = 0;
+        classifiers = [
+          { "useSplitWords" = "true";
+            "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
+            "splitWordsIgnoreRegexp" = ''\s+'';
+            "useSplitPrefixSuffixNGrams" = "true";
+            "maxNGramLeng" = "4";
+            "minNGramLeng" = "1";
+            "splitWordShape" = "chris4";
+            "intern" = "true";
+          }
+        ];
+      };
       working-dir = "/tmp/docspell-analysis";
     };
     processing = {
@ -736,6 +751,59 @@ in {
           default = defaults.text-analysis.regex-ner;
           description = "";
         };
+
+        classification = mkOption {
+          type = types.submodule({
+            options = {
+              enabled = mkOption {
+                type = types.bool;
+                default = defaults.text-analysis.classification.enabled;
+                description = ''
+                  Whether to enable classification globally. Each collective can
+                  decide to disable it. If it is disabled here, no collective
+                  can use classification.
+                '';
+              };
+              item-count = mkOption {
+                type = types.int;
+                default = defaults.text-analysis.classification.item-count;
+                description = ''
+                  If concerned with memory consumption, this restricts the
+                  number of items to consider. More are better for training. A
+                  negative value or zero means to train on all items.
+                '';
+              };
+              classifiers = mkOption {
+                type = types.listOf types.attrs;
+                default = defaults.text-analysis.classification.classifiers;
+                description = ''
+                  These settings are used to configure the classifier. If
+                  multiple are given, they are all tried and the "best" is
+                  chosen at the end. See
+                  https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+                  for more info about these settings. The settings here yielded
+                  good results with *my* dataset.
+                '';
+              };
+            };
+          });
+          default = defaults.text-analysis.classification;
+          description = ''
+            Settings for doing document classification.
+
+            This works by learning from existing documents. A collective can
+            specify a tag category and the system will try to predict a tag
+            from this category for new incoming documents.
+
+            This requires a statistical model that is computed from all
+            existing documents. This process is run periodically as
+            configured by the collective. It may require a lot of memory,
+            depending on the amount of data.
+
+            It utilises this NLP library: https://nlp.stanford.edu/.
+          '';
+        };
       };
     });
     default = defaults.text-analysis;
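The classifier attribute set above maps directly to properties of Stanford NLP's ColumnDataClassifier (see the javadoc linked in the option description). A minimal sketch of feeding such properties to the Stanford API; the wiring here is illustrative, not docspell's actual code:

import java.util.Properties
import edu.stanford.nlp.classify.ColumnDataClassifier

// Each key/value pair from the config becomes a classifier property.
val props = new Properties()
props.setProperty("useSplitWords", "true")
props.setProperty("useSplitPrefixSuffixNGrams", "true")
props.setProperty("maxNGramLeng", "4")
props.setProperty("minNGramLeng", "1")
props.setProperty("splitWordShape", "chris4")
props.setProperty("intern", "true")

val cdc = new ColumnDataClassifier(props)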
@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
     , { image = "img/analyze-feature.png"
       , header = "Text Analysis"
       , description = """
-The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
+The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically.
 """
       }
     , { image = "img/filetype-feature.svg"
@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
 "assignment" semantics. Docspell doesn't propose any workflow, but it
 can help to implement some.

-The tags are *not* taken into account when creating suggestions from
-analyzed text yet. However, PDF files may contain metadata itself and
-if there is a metadata *keywords* list, these keywords are matched
-against the tags in the database. If they match, the item is tagged
-automatically.
+Docspell can try to predict a tag for new incoming documents
+automatically, based on your existing data. This requires training an
+algorithm. There are some caveats: the more data you have correctly
+tagged, the better the results. So it won't work well for, say, the
+first 100 documents. The tags must also relate to some pattern in the
+document text. Tags like *todo* or *waiting* obviously won't work, but
+the typical "document type" tags, like *invoice* and *receipt*, are a
+good fit. That is why you need to provide a tag category, so only
+sensible tags are learned. The algorithm goes through all your items
+and learns patterns in the text that relate to the given tags. This
+training step can be run periodically, as specified in your collective
+settings, such that docspell keeps learning from your already tagged
+data. More information about the algorithm can be found in the config,
+where it is possible to fine-tune this process.
+
+Another way to have items tagged automatically is when an input PDF
+file contains a list of keywords in its metadata section. These
+keywords are matched against the tags in the database; if they match,
+the item is tagged with them.


 ## Organization and Person