From 8c4f2e702ba0b0ffbfe654a2e53bae4ed9bc90cc Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Fri, 28 Aug 2020 22:17:49 +0200
Subject: [PATCH 01/10] Add classifier settings

---
 .../docspell/backend/ops/OCollective.scala    |   9 +-
 .../docspell/common/LearnClassifierArgs.scala |  35 +++
 .../joex/src/main/resources/reference.conf    |  44 ++++
 .../src/main/scala/docspell/joex/Config.scala |   9 +-
 .../src/main/resources/docspell-openapi.yml   |  27 +++
 .../restserver/routes/CollectiveRoutes.scala  |  31 ++-
 .../migration/mariadb/V1.9.1__classifier.sql  |   9 +
 .../postgresql/V1.9.1__classifier.sql         |  11 +
 .../store/records/RClassifierSetting.scala    | 106 ++++++++++
 .../docspell/store/records/RCollective.scala  |  56 ++++-
 modules/webapp/src/main/elm/App/View.elm      |   4 +-
 .../main/elm/Comp/ClassifierSettingsForm.elm  | 199 ++++++++++++++++++
 .../main/elm/Comp/CollectiveSettingsForm.elm  | 128 ++++++++---
 .../webapp/src/main/elm/Data/Validated.elm    |  14 ++
 .../main/elm/Page/CollectiveSettings/Data.elm |  10 +-
 .../elm/Page/CollectiveSettings/Update.elm    |   8 +-
 .../main/elm/Page/CollectiveSettings/View.elm |   5 +-
 17 files changed, 649 insertions(+), 56 deletions(-)
 create mode 100644 modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala
 create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql
 create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql
 create mode 100644 modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
 create mode 100644 modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm

diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
index e3835448..48934016 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
@@ -15,7 +15,9 @@ trait OCollective[F[_]] {
 
   def find(name: Ident): F[Option[RCollective]]
 
-  def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
+  def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]
+
+  def findSettings(collective: Ident): F[Option[OCollective.Settings]]
 
   def listUser(collective: Ident): F[Vector[RUser]]
 
@@ -55,6 +57,8 @@ object OCollective {
 
   type Settings = RCollective.Settings
   val Settings = RCollective.Settings
+  type Classifier = RClassifierSetting.Classifier
+  val Classifier = RClassifierSetting.Classifier
 
   sealed trait PassChangeResult
   object PassChangeResult {
@@ -102,6 +106,9 @@ object OCollective {
           .attempt
           .map(AddResult.fromUpdate)
 
+      def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
+        store.transact(RCollective.getSettings(collective))
+
       def listUser(collective: Ident): F[Vector[RUser]] =
         store.transact(RUser.findAll(collective, _.login))
 
diff --git a/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala
new file mode 100644
index 00000000..9cfa9395
--- /dev/null
+++ b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala
@@ -0,0 +1,35 @@
+package docspell.common
+
+import docspell.common.syntax.all._
+
+import io.circe._
+import io.circe.generic.semiauto._
+
+/** Arguments to the learn-classifier task.
+  *
+  * This task is run periodically and learns from existing documents
+  * to create a model for predicting tags of new documents. The user
+  * must give a tag category as a subset of possible tags.
+  */
+case class LearnClassifierArgs(
+    collective: Ident
+) {
+
+  def makeSubject: String =
+    "Learn tags"
+
+}
+
+object LearnClassifierArgs {
+
+  val taskName = Ident.unsafe("learn-classifier")
+
+  implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
+    deriveEncoder[LearnClassifierArgs]
+  implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
+    deriveDecoder[LearnClassifierArgs]
+
+  def parse(str: String): Either[Throwable, LearnClassifierArgs] =
+    str.parseJsonAs[LearnClassifierArgs]
+
+}
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 115d2893..746f7bac 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -271,6 +271,50 @@ docspell.joex {
       # file will be kept until a check for a state change is done.
       file-cache-time = "1 minute"
     }
+
+    # Settings for doing document classification.
+    #
+    # This works by learning from existing documents. A collective can
+    # specify a tag category and the system will try to predict a tag
+    # from this category for new incoming documents.
+    #
+    # This requires a statistical model that is computed from all
+    # existing documents. This process is run periodically as
+    # configured by the collective. It may require a lot of memory,
+    # depending on the amount of data.
+    #
+    # It utilises this NLP library: https://nlp.stanford.edu/.
+    classification {
+      # Whether to enable classification globally. Each collective can
+      # decide to disable it. If it is disabled here, no collective
+      # can use classification.
+      enabled = true
+
+      # If concerned with memory consumption, this restricts the
+      # number of items to consider. More are better for training. A
+      # negative value or zero means to train on all items.
+      item-count = 0
+
+      # These settings are used to configure the classifier. If
+      # multiple are given, they are all tried and the "best" is
+      # chosen at the end. See
+      # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
+      # for more info about these settings. The settings are almost
+      # identical to them, as they yielded best results with *my*
+      # dataset.
+      #
+      # Enclose regexps in triple quotes.
+      classifiers = [
+        { "useSplitWords" = "true"
+          "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
+          "splitWordsIgnoreRegexp" = """\s+"""
+          "useSplitPrefixSuffixNGrams" = "true"
+          "maxNGramLeng" = "4"
+          "minNGramLeng" = "1"
+          "splitWordShape" = "chris4"
+        }
+      ]
+    }
   }
 
   # Configuration for converting files into PDFs.
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index cb6bb9f3..a90ad61a 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -57,7 +57,8 @@ object Config {
   case class TextAnalysis(
       maxLength: Int,
       workingDir: Path,
-      regexNer: RegexNer
+      regexNer: RegexNer,
+      classification: Classification
   ) {
 
     def textAnalysisConfig: TextAnalysisConfig =
@@ -68,4 +69,10 @@ object Config {
   }
 
   case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
+
+  case class Classification(
+      enabled: Boolean,
+      itemCount: Int,
+      classifiers: List[Map[String, String]]
+  )
 }
diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml
index 1a48eece..1a20db8d 100644
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@@ -3643,12 +3643,14 @@ components:
           description: DateTime
           type: integer
           format: date-time
+
     CollectiveSettings:
       description: |
         Settings for a collective.
       required:
         - language
         - integrationEnabled
+        - classifier
       properties:
         language:
           type: string
@@ -3658,6 +3660,31 @@ components:
           description: |
             Whether the collective has the integration endpoint
             enabled.
+        classifier:
+          $ref: "#/components/schemas/ClassifierSetting"
+
+    ClassifierSetting:
+      description: |
+        Settings for learning a document classifier.
+      required:
+        - enabled
+        - schedule
+        - itemCount
+      properties:
+        enabled:
+          type: boolean
+        category:
+          type: string
+        itemCount:
+          type: integer
+          format: int32
+          description: |
+            The max. number of items to learn from. The newest items
+            are considered.
+        schedule:
+          type: string
+          format: calevent
+
     SourceList:
       description: |
         A list of sources.
diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
index 8a84fa77..2aed289f 100644
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
@@ -10,6 +10,7 @@ import docspell.restapi.model._
 import docspell.restserver.conv.Conversions
 import docspell.restserver.http4s._
 
+import com.github.eikek.calev.CalEvent
 import org.http4s.HttpRoutes
 import org.http4s.circe.CirceEntityDecoder._
 import org.http4s.circe.CirceEntityEncoder._
@@ -37,7 +38,18 @@ object CollectiveRoutes {
       case req @ POST -> Root / "settings" =>
         for {
           settings <- req.as[CollectiveSettings]
-          sett = OCollective.Settings(settings.language, settings.integrationEnabled)
+          sett = OCollective.Settings(
+            settings.language,
+            settings.integrationEnabled,
+            Some(
+              OCollective.Classifier(
+                settings.classifier.enabled,
+                settings.classifier.schedule,
+                settings.classifier.itemCount,
+                settings.classifier.category
+              )
+            )
+          )
           res <-
             backend.collective
               .updateSettings(user.account.collective, sett)
@@ -46,8 +58,21 @@ object CollectiveRoutes {
 
       case GET -> Root / "settings" =>
         for {
-          collDb <- backend.collective.find(user.account.collective)
-          sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
+          settDb <- backend.collective.findSettings(user.account.collective)
+          sett = settDb.map(c =>
+            CollectiveSettings(
+              c.language,
+              c.integrationEnabled,
+              ClassifierSetting(
+                c.classifier.map(_.enabled).getOrElse(false),
+                c.classifier.flatMap(_.category),
+                c.classifier.map(_.itemCount).getOrElse(0),
+                c.classifier
+                  .map(_.schedule)
+                  .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
+              )
+            )
+          )
           resp <- sett.toResponse()
         } yield resp
 
diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql
new file mode 100644
index 00000000..fb1e85cd
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql
@@ -0,0 +1,12 @@
+-- Per-collective settings for learning a document classifier.
+CREATE TABLE `classifier_setting` (
+  `cid` varchar(254) not null primary key,
+  `enabled` boolean not null,
+  `schedule` varchar(254) not null,
+  `category` varchar(254) not null,
+  `item_count` int not null,
+  `file_id` varchar(254),
+  `created` timestamp not null,
+  foreign key (`cid`) references `collective`(`cid`),
+  foreign key (`file_id`) references `filemeta`(`id`)
+);
diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql
new file mode 100644
index 00000000..5e81feea
--- /dev/null
+++ b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql
@@ -0,0 +1,11 @@
+CREATE TABLE "classifier_setting" (
+  "cid" varchar(254) not null primary key,
+  "enabled" boolean not null,
+  "schedule" varchar(254) not null,
+  "category" varchar(254) not null,
+  "item_count" int not null,
+  "file_id" varchar(254),
+  "created" timestamp not null,
+  foreign key ("cid") references "collective"("cid"),
+  foreign key ("file_id") references "filemeta"("id")
+);
diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
new file mode 100644
index 00000000..671a8d8f
--- /dev/null
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@@ -0,0 +1,106 @@
+package docspell.store.records
+
+import cats.implicits._
+
+import docspell.common._
+import docspell.store.impl.Implicits._
+import docspell.store.impl._
+
+import com.github.eikek.calev._
+import doobie._
+import doobie.implicits._
+
+case class RClassifierSetting(
+    cid: Ident,
+    enabled: Boolean,
+    schedule: CalEvent,
+    category: String,
+    itemCount: Int,
+    fileId: Option[Ident],
+    created: Timestamp
+) {}
+
+object RClassifierSetting {
+
+  val table = fr"classifier_setting"
+
+  object Columns {
+    val cid       = Column("cid")
+    val enabled   = Column("enabled")
+    val schedule  = Column("schedule")
+    val category  = Column("category")
+    val itemCount = Column("item_count")
+    val fileId    = Column("file_id")
+    val created   = Column("created")
+    val all       = List(cid, enabled, schedule, category, itemCount, fileId, created)
+  }
+  import Columns._
+
+  def insert(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql =
+      insertRow(
+        table,
+        all,
+        fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
+      )
+    sql.update.run
+  }
+
+  def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
+    val sql = updateRow(
+      table,
+      cid.is(v.cid),
+      commas(
+        enabled.setTo(v.enabled),
+        schedule.setTo(v.schedule),
+        category.setTo(v.category),
+        itemCount.setTo(v.itemCount),
+        fileId.setTo(v.fileId)
+      )
+    )
+    sql.update.run
+  }
+
+  def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
+    for {
+      n1 <- updateRow(
+        table,
+        cid.is(v.cid),
+        commas(
+          enabled.setTo(v.enabled),
+          schedule.setTo(v.schedule),
+          itemCount.setTo(v.itemCount),
+          category.setTo(v.category)
+        )
+      ).update.run
+      n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
+    } yield n1 + n2
+
+  def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
+    val sql = selectSimple(all, table, cid.is(id))
+    sql.query[RClassifierSetting].option
+  }
+
+  def delete(coll: Ident): ConnectionIO[Int] =
+    deleteFrom(table, cid.is(coll)).update.run
+
+  case class Classifier(
+      enabled: Boolean,
+      schedule: CalEvent,
+      itemCount: Int,
+      category: Option[String]
+  ) {
+
+    def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
+      RClassifierSetting(
+        coll,
+        enabled,
+        schedule,
+        category.getOrElse(""),
+        itemCount,
+        None,
+        created
+      )
+  }
+
+}
diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala
index fa40e374..2487ed22 100644
--- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala
@@ -61,14 +61,47 @@ object RCollective {
     updateRow(table, id.is(cid), language.setTo(lang)).update.run
 
   def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
-    updateRow(
-      table,
-      id.is(cid),
-      commas(
-        language.setTo(settings.language),
-        integration.setTo(settings.integrationEnabled)
-      )
-    ).update.run
+    for {
+      n1 <- updateRow(
+        table,
+        id.is(cid),
+        commas(
+          language.setTo(settings.language),
+          integration.setTo(settings.integrationEnabled)
+        )
+      ).update.run
+      cls <-
+        Timestamp
+          .current[ConnectionIO]
+          .map(now => settings.classifier.map(_.toRecord(cid, now)))
+      n2 <- cls match {
+        case Some(cr) =>
+          RClassifierSetting.updateSettings(cr)
+        case None =>
+          RClassifierSetting.delete(cid)
+      }
+    } yield n1 + n2
+
+  def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
+    val cId   = id.prefix("c")
+    val CS    = RClassifierSetting.Columns
+    val csCid = CS.cid.prefix("cs")
+
+    val cols = Seq(
+      language.prefix("c"),
+      integration.prefix("c"),
+      CS.enabled.prefix("cs"),
+      CS.schedule.prefix("cs"),
+      CS.itemCount.prefix("cs"),
+      CS.category.prefix("cs")
+    )
+    val from = table ++ fr"c LEFT JOIN" ++
+      RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)
+
+    selectSimple(cols, from, cId.is(coll))
+      .query[Settings]
+      .option
+  }
 
   def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
     val sql = selectSimple(all, table, id.is(cid))
@@ -112,5 +145,10 @@ object RCollective {
     selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
   }
 
-  case class Settings(language: Language, integrationEnabled: Boolean)
+  case class Settings(
+      language: Language,
+      integrationEnabled: Boolean,
+      classifier: Option[RClassifierSetting.Classifier]
+  )
+
 }
diff --git a/modules/webapp/src/main/elm/App/View.elm b/modules/webapp/src/main/elm/App/View.elm
index 6906fd2f..346983e6 100644
--- a/modules/webapp/src/main/elm/App/View.elm
+++ b/modules/webapp/src/main/elm/App/View.elm
@@ -218,12 +218,12 @@ loginInfo model =
                         , menuEntry model
                             CollectiveSettingPage
                             [ i [ class "users circle icon" ] []
-                            , text "Collective Settings"
+                            , text "Collective Profile"
                             ]
                         , menuEntry model
                             UserSettingPage
                             [ i [ class "user circle icon" ] []
-                            , text "User Settings"
+                            , text "User Profile"
                             ]
                         , div [ class "divider" ] []
                         , menuEntry model
diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
new file mode 100644
index 00000000..ef6a7638
--- /dev/null
+++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
@@ -0,0 +1,199 @@
+module Comp.ClassifierSettingsForm exposing
+    ( Model
+    , Msg
+    , getSettings
+    , init
+    , update
+    , view
+    )
+
+import Api
+import Api.Model.ClassifierSetting exposing (ClassifierSetting)
+import Api.Model.TagList exposing (TagList)
+import Comp.CalEventInput
+import Comp.FixedDropdown
+import Comp.IntField
+import Data.CalEvent exposing (CalEvent)
+import Data.Flags exposing (Flags)
+import Data.Validated exposing (Validated(..))
+import Html exposing (..)
+import Html.Attributes exposing (..)
+import Html.Events exposing (onCheck)
+import Http
+import Util.Tag
+
+
+type alias Model =
+    { enabled : Bool
+    , categoryModel : Comp.FixedDropdown.Model String
+    , category : Maybe String
+    , scheduleModel : Comp.CalEventInput.Model
+    , schedule : Validated CalEvent
+    , itemCountModel : Comp.IntField.Model
+    , itemCount : Maybe Int
+    }
+
+
+type Msg
+    = GetTagsResp (Result Http.Error TagList)
+    | ScheduleMsg Comp.CalEventInput.Msg
+    | ToggleEnabled
+    | CategoryMsg (Comp.FixedDropdown.Msg String)
+    | ItemCountMsg Comp.IntField.Msg
+
+
+init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
+init flags sett =
+    let
+        newSchedule =
+            Data.CalEvent.fromEvent sett.schedule
+                |> Maybe.withDefault Data.CalEvent.everyMonth
+
+        ( cem, cec ) =
+            Comp.CalEventInput.init flags newSchedule
+    in
+    ( { enabled = sett.enabled
+      , categoryModel = Comp.FixedDropdown.initString []
+      , category = sett.category
+      , scheduleModel = cem
+      , schedule = Data.Validated.Unknown newSchedule
+      , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
+      , itemCount = Just sett.itemCount
+      }
+    , Cmd.batch
+        [ Api.getTags flags "" GetTagsResp
+        , Cmd.map ScheduleMsg cec
+        ]
+    )
+
+
+getSettings : Model -> Validated ClassifierSetting
+getSettings model =
+    Data.Validated.map
+        (\sch ->
+            { enabled = model.enabled
+            , category = model.category
+            , schedule =
+                Data.CalEvent.makeEvent sch
+            , itemCount = Maybe.withDefault 0 model.itemCount
+            }
+        )
+        model.schedule
+
+
+update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
+update flags msg model =
+    case msg of
+        GetTagsResp (Ok tl) ->
+            let
+                categories =
+                    List.sort (Util.Tag.getCategories tl.items)
+
+                -- prefer the saved category if it still exists, else default to the first one
+                selected =
+                    List.filter (\c -> Just c == model.category) categories ++ categories
+            in
+            ( { model | categoryModel = Comp.FixedDropdown.initString categories, category = List.head selected }
+            , Cmd.none
+            )
+
+        GetTagsResp (Err _) ->
+            ( model, Cmd.none )
+
+        ScheduleMsg lmsg ->
+            let
+                ( cm, cc, ce ) =
+                    Comp.CalEventInput.update
+                        flags
+                        (Data.Validated.value model.schedule)
+                        lmsg
+                        model.scheduleModel
+            in
+            ( { model
+                | scheduleModel = cm
+                , schedule = ce
+              }
+            , Cmd.map ScheduleMsg cc
+            )
+
+        ToggleEnabled ->
+            ( { model | enabled = not model.enabled }
+            , Cmd.none
+            )
+
+        CategoryMsg lmsg ->
+            let
+                ( mm, ma ) =
+                    Comp.FixedDropdown.update lmsg model.categoryModel
+            in
+            ( { model
+                | categoryModel = mm
+                , category =
+                    if ma == Nothing then
+                        model.category
+
+                    else
+                        ma
+              }
+            , Cmd.none
+            )
+
+        ItemCountMsg lmsg ->
+            let
+                ( im, iv ) =
+                    Comp.IntField.update lmsg model.itemCountModel
+            in
+            ( { model
+                | itemCountModel = im
+                , itemCount = iv
+              }
+            , Cmd.none
+            )
+
+
+view : Model -> Html Msg
+view model =
+    div []
+        [ div
+            [ class "field"
+            ]
+            [ div [ class "ui checkbox" ]
+                [ input
+                    [ type_ "checkbox"
+                    , onCheck (\_ -> ToggleEnabled)
+                    , checked model.enabled
+                    ]
+                    []
+                , label [] [ text "Enable classification" ]
+                , span [ class "small-info" ]
+                    [ text "Disable document classification if not needed."
+                    ]
+                ]
+            ]
+        , div [ class "ui basic segment" ]
+            [ text "Document classification tries to predict a tag for new incoming documents. This "
+            , text "works by learning from existing documents in order to find common patterns within "
+            , text "the text. The more documents you have correctly tagged, the better. Learning is done "
+            , text "periodically based on a schedule and you need to specify a tag-group that should "
+            , text "be used for learning."
+            ]
+        , div [ class "field" ]
+            [ label [] [ text "Category" ]
+            , Html.map CategoryMsg
+                (Comp.FixedDropdown.viewString model.category
+                    model.categoryModel
+                )
+            ]
+        , Html.map ItemCountMsg
+            (Comp.IntField.viewWithInfo
+                "The maximum number of items to learn from, order by date newest first. Use 0 to mean all."
+                model.itemCount
+                "field"
+                model.itemCountModel
+            )
+        , div [ class "field" ]
+            [ label [] [ text "Schedule" ]
+            , Html.map ScheduleMsg
+                (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
+            ]
+        ]
diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
index 342473c1..87696d85 100644
--- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
+++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
@@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
 import Api
 import Api.Model.BasicResult exposing (BasicResult)
 import Api.Model.CollectiveSettings exposing (CollectiveSettings)
+import Comp.ClassifierSettingsForm
 import Comp.Dropdown
 import Data.Flags exposing (Flags)
 import Data.Language exposing (Language)
 import Data.UiSettings exposing (UiSettings)
+import Data.Validated exposing (Validated)
 import Html exposing (..)
 import Html.Attributes exposing (..)
 import Html.Events exposing (onCheck, onClick, onInput)
@@ -27,44 +29,58 @@ type alias Model =
     , initSettings : CollectiveSettings
     , fullTextConfirmText : String
     , fullTextReIndexResult : Maybe BasicResult
+    , classifierModel : Comp.ClassifierSettingsForm.Model
     }
 
 
-init : CollectiveSettings -> Model
-init settings =
+init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
+init flags settings =
     let
         lang =
             Data.Language.fromString settings.language
                 |> Maybe.withDefault Data.Language.German
+
+        ( cm, cc ) =
+            Comp.ClassifierSettingsForm.init flags settings.classifier
     in
-    { langModel =
-        Comp.Dropdown.makeSingleList
-            { makeOption =
-                \l ->
-                    { value = Data.Language.toIso3 l
-                    , text = Data.Language.toName l
-                    , additional = ""
-                    }
-            , placeholder = ""
-            , options = Data.Language.all
-            , selected = Just lang
-            }
-    , intEnabled = settings.integrationEnabled
-    , initSettings = settings
-    , fullTextConfirmText = ""
-    , fullTextReIndexResult = Nothing
-    }
+    ( { langModel =
+            Comp.Dropdown.makeSingleList
+                { makeOption =
+                    \l ->
+                        { value = Data.Language.toIso3 l
+                        , text = Data.Language.toName l
+                        , additional = ""
+                        }
+                , placeholder = ""
+                , options = Data.Language.all
+                , selected = Just lang
+                }
+      , intEnabled = settings.integrationEnabled
+      , initSettings = settings
+      , fullTextConfirmText = ""
+      , fullTextReIndexResult = Nothing
+      , classifierModel = cm
+      }
+    , Cmd.map ClassifierSettingMsg cc
+    )
 
 
-getSettings : Model -> CollectiveSettings
+getSettings : Model -> Validated CollectiveSettings
 getSettings model =
-    CollectiveSettings
-        (Comp.Dropdown.getSelected model.langModel
-            |> List.head
-            |> Maybe.map Data.Language.toIso3
-            |> Maybe.withDefault model.initSettings.language
+    Data.Validated.map
+        (\cls ->
+            { language =
+                Comp.Dropdown.getSelected model.langModel
+                    |> List.head
+                    |> Maybe.map Data.Language.toIso3
+                    |> Maybe.withDefault model.initSettings.language
+            , integrationEnabled = model.intEnabled
+            , classifier = cls
+            }
+        )
+        (Comp.ClassifierSettingsForm.getSettings
+            model.classifierModel
         )
-        model.intEnabled
 
 
 type Msg
@@ -73,6 +89,8 @@ type Msg
     | SetFullTextConfirm String
     | TriggerReIndex
     | TriggerReIndexResult (Result Http.Error BasicResult)
+    | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
+    | SaveSettings
 
 
 update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@@ -85,22 +103,15 @@ update flags msg model =
 
                 nextModel =
                     { model | langModel = m2 }
-
-                nextSettings =
-                    if Comp.Dropdown.isDropdownChangeMsg m then
-                        Just (getSettings nextModel)
-
-                    else
-                        Nothing
             in
-            ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
+            ( nextModel, Cmd.map LangDropdownMsg c2, Nothing )
 
         ToggleIntegrationEndpoint ->
             let
                 nextModel =
                     { model | intEnabled = not model.intEnabled }
             in
-            ( nextModel, Cmd.none, Just (getSettings nextModel) )
+            ( nextModel, Cmd.none, Nothing )
 
         SetFullTextConfirm str ->
             ( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@@ -138,6 +149,26 @@ update flags msg model =
             , Nothing
             )
 
+        ClassifierSettingMsg lmsg ->
+            let
+                ( cm, cc ) =
+                    Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
+            in
+            ( { model
+                | classifierModel = cm
+              }
+            , Cmd.map ClassifierSettingMsg cc
+            , Nothing
+            )
+
+        SaveSettings ->
+            case getSettings model of
+                Data.Validated.Valid s ->
+                    ( model, Cmd.none, Just s )
+
+                _ ->
+                    ( model, Cmd.none, Nothing )
+
 
 view : Flags -> UiSettings -> Model -> Html Msg
 view flags settings model =
@@ -232,4 +263,31 @@ view flags settings model =
                     |> text
                 ]
             ]
+        , h3
+            [ classList
+                [ ( "ui dividing header", True )
+                , ( "invisible hidden", False )
+                ]
+            ]
+            [ text "Document Classifier"
+            ]
+        , div
+            [ classList
+                [ ( "field", True )
+                , ( "invisible hidden", False )
+                ]
+            ]
+            [ Html.map ClassifierSettingMsg
+                (Comp.ClassifierSettingsForm.view model.classifierModel)
+            ]
+        , div [ class "ui divider" ] []
+        , button
+            [ classList
+                [ ( "ui primary button", True )
+                , ( "disabled", getSettings model |> Data.Validated.isInvalid )
+                ]
+            , onClick SaveSettings
+            ]
+            [ text "Save"
+            ]
         ]
diff --git a/modules/webapp/src/main/elm/Data/Validated.elm b/modules/webapp/src/main/elm/Data/Validated.elm
index c56f98c6..40e0f97e 100644
--- a/modules/webapp/src/main/elm/Data/Validated.elm
+++ b/modules/webapp/src/main/elm/Data/Validated.elm
@@ -1,5 +1,6 @@
 module Data.Validated exposing
     ( Validated(..)
+    , isInvalid
     , map
     , map2
     , map3
@@ -14,6 +15,19 @@ type Validated a
     | Unknown a
 
 
+isInvalid : Validated a -> Bool
+isInvalid v =
+    case v of
+        Valid _ ->
+            False
+
+        Invalid _ _ ->
+            True
+
+        Unknown _ ->
+            False
+
+
 value : Validated a -> a
 value va =
     case va of
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
index 1b1bd53b..b8dd6a2b 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm
@@ -30,15 +30,21 @@ init flags =
     let
         ( sm, sc ) =
             Comp.SourceManage.init flags
+
+        ( cm, cc ) =
+            Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
     in
     ( { currentTab = Just InsightsTab
       , sourceModel = sm
       , userModel = Comp.UserManage.emptyModel
-      , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
+      , settingsModel = cm
       , insights = Api.Model.ItemInsights.empty
       , submitResult = Nothing
       }
-    , Cmd.map SourceMsg sc
+    , Cmd.batch
+        [ Cmd.map SourceMsg sc
+        , Cmd.map SettingsFormMsg cc
+        ]
     )
 
 
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
index fa9ab433..7ad68e16 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm
@@ -77,7 +77,13 @@ update flags msg model =
             ( model, Cmd.none )
 
         CollectiveSettingsResp (Ok data) ->
-            ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
+            let
+                ( cm, cc ) =
+                    Comp.CollectiveSettingsForm.init flags data
+            in
+            ( { model | settingsModel = cm }
+            , Cmd.map SettingsFormMsg cc
+            )
 
         CollectiveSettingsResp (Err _) ->
             ( model, Cmd.none )
diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
index 513e2719..c46aacfb 100644
--- a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
+++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm
@@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
 viewSettings flags settings model =
     [ h2 [ class "ui header" ]
         [ i [ class "cog icon" ] []
-        , text "Settings"
+        , text "Collective Settings"
         ]
     , div [ class "ui segment" ]
-        [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
+        [ Html.map SettingsFormMsg
+            (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
         ]
     , div
         [ classList

From 0c97b4ef762f30a0dc77dacbf3f06dac56df0752 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Mon, 31 Aug 2020 22:35:27 +0200
Subject: [PATCH 02/10] Initial impl of a text classifier based on stanford-nlp

---
 .../docspell/analysis/TextAnalyser.scala      |  16 +-
 .../analysis/TextAnalysisConfig.scala         |   5 +-
 .../analysis/nlp/ClassifierModel.scala        |   5 +
 .../docspell/analysis/nlp/PipelineCache.scala |  12 +-
 .../docspell/analysis/nlp/Properties.scala    |   5 +-
 .../analysis/nlp/StanfordNerClassifier.scala  |   2 +-
 ...ttings.scala => StanfordNerSettings.scala} |   6 +-
 .../analysis/nlp/StanfordTextClassifier.scala | 149 ++++++++++++++++++
 .../analysis/nlp/TextClassifier.scala         |  25 +++
 .../analysis/nlp/TextClassifierConfig.scala   |  10 ++
 .../analysis/src/test/resources/test.ser.gz   | Bin 0 -> 1682 bytes
 .../nlp/StanfordTextClassifierSuite.scala     |  76 +++++++++
 .../joex/src/main/resources/reference.conf    |   2 +-
 .../src/main/scala/docspell/joex/Config.scala |  13 +-
 .../joex/learn/LearnClassifierTask.scala      |  64 ++++++++
 .../docspell/joex/process/TextAnalysis.scala  |   4 +-
 16 files changed, 376 insertions(+), 18 deletions(-)
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala
 rename modules/analysis/src/main/scala/docspell/analysis/nlp/{StanfordSettings.scala => StanfordNerSettings.scala} (88%)
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala
 create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala
 create mode 100644 modules/analysis/src/test/resources/test.ser.gz
 create mode 100644 modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala
 create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala

diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
index 75d07eef..44f7203b 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala
@@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact
 import docspell.analysis.date.DateFind
 import docspell.analysis.nlp.PipelineCache
 import docspell.analysis.nlp.StanfordNerClassifier
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.StanfordTextClassifier
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
 
 trait TextAnalyser[F[_]] {
 
   def annotate(
       logger: Logger[F],
-      settings: StanfordSettings,
+      settings: StanfordNerSettings,
       cacheKey: Ident,
       text: String
   ): F[TextAnalyser.Result]
 
+  def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F]
 }
 object TextAnalyser {
 
@@ -35,7 +38,7 @@ object TextAnalyser {
         new TextAnalyser[F] {
           def annotate(
               logger: Logger[F],
-              settings: StanfordSettings,
+              settings: StanfordNerSettings,
               cacheKey: Ident,
               text: String
           ): F[TextAnalyser.Result] =
@@ -48,6 +51,11 @@ object TextAnalyser {
               spans = NerLabelSpan.build(list)
             } yield Result(spans ++ list, dates)
 
+          def classifier(blocker: Blocker)(implicit
+              CS: ContextShift[F]
+          ): TextClassifier[F] =
+            new StanfordTextClassifier[F](cfg.classifier, blocker)
+
           private def textLimit(logger: Logger[F], text: String): F[String] =
             if (text.length <= cfg.maxLength) text.pure[F]
             else
@@ -56,7 +64,7 @@ object TextAnalyser {
                   s" Analysing only first ${cfg.maxLength} characters."
               ) *> text.take(cfg.maxLength).pure[F]
 
-          private def stanfordNer(key: Ident, settings: StanfordSettings, text: String)
+          private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String)
               : F[Vector[NerLabel]] =
             StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text)
 
diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
index 577f6753..596a6247 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala
@@ -1,5 +1,8 @@
 package docspell.analysis
 
+import docspell.analysis.nlp.TextClassifierConfig
+
 case class TextAnalysisConfig(
-    maxLength: Int
+    maxLength: Int,
+    classifier: TextClassifierConfig
 )
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala
new file mode 100644
index 00000000..82f9f9cc
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala
@@ -0,0 +1,5 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+case class ClassifierModel(model: Path)
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
index 9787563f..88e13ee3 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala
@@ -19,7 +19,7 @@ import org.log4s.getLogger
   */
 trait PipelineCache[F[_]] {
 
-  def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP]
+  def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP]
 
 }
 
@@ -28,7 +28,7 @@ object PipelineCache {
 
   def none[F[_]: Applicative]: PipelineCache[F] =
     new PipelineCache[F] {
-      def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+      def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
         makeClassifier(settings).pure[F]
     }
 
@@ -38,7 +38,7 @@ object PipelineCache {
   final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]])
       extends PipelineCache[F] {
 
-    def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] =
+    def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] =
       for {
         id  <- makeSettingsId(settings)
         nlp <- data.modify(cache => getOrCreate(key, id, cache, settings))
@@ -48,7 +48,7 @@ object PipelineCache {
         key: String,
         id: String,
         cache: Map[String, Entry],
-        settings: StanfordSettings
+        settings: StanfordNerSettings
     ): (Map[String, Entry], StanfordCoreNLP) =
       cache.get(key) match {
         case Some(entry) =>
@@ -68,7 +68,7 @@ object PipelineCache {
           (cache.updated(key, e), nlp)
       }
 
-    private def makeSettingsId(settings: StanfordSettings): F[String] = {
+    private def makeSettingsId(settings: StanfordNerSettings): F[String] = {
       val base = settings.copy(regexNer = None).toString
       val size: F[Long] =
         settings.regexNer match {
@@ -81,7 +81,7 @@ object PipelineCache {
     }
 
   }
-  private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = {
+  private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = {
     logger.info(s"Creating ${settings.lang.name} Stanford NLP NER classifier...")
     new StanfordCoreNLP(Properties.forSettings(settings))
   }
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
index 314f04fb..46a614d1 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala
@@ -7,6 +7,9 @@ import docspell.common._
 
 object Properties {
 
+  def fromMap(m: Map[String, String]): JProps =
+    apply(m.toSeq: _*)
+
   def apply(ps: (String, String)*): JProps = {
     val p = new JProps()
     for ((k, v) <- ps)
@@ -14,7 +17,7 @@ object Properties {
     p
   }
 
-  def forSettings(settings: StanfordSettings): JProps = {
+  def forSettings(settings: StanfordNerSettings): JProps = {
     val regexNerFile = settings.regexNer
       .map(p => p.normalize().toAbsolutePath().toString())
     settings.lang match {
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
index 424396e5..383a07ea 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
@@ -25,7 +25,7 @@ object StanfordNerClassifier {
   def nerAnnotate[F[_]: Applicative](
       cacheKey: String,
       cache: PipelineCache[F]
-  )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] =
+  )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] =
     cache
       .obtain(cacheKey, settings)
       .map(crf => runClassifier(crf, text))
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
similarity index 88%
rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala
rename to modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
index c2f6f98c..06136a18 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala
@@ -19,4 +19,8 @@ import docspell.common._
   * as a last step to tag untagged tokens using the provided list of
   * regexps.
   */
-case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path])
+case class StanfordNerSettings(
+    lang: Language,
+    highRecall: Boolean,
+    regexNer: Option[Path]
+)
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
new file mode 100644
index 00000000..3da3b5ba
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
@@ -0,0 +1,149 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.effect._
+import cats.effect.concurrent.Ref
+import cats.implicits._
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier._
+import docspell.common._
+
+import edu.stanford.nlp.classify.ColumnDataClassifier
+
+final class StanfordTextClassifier[F[_]: Sync: ContextShift](
+    cfg: TextClassifierConfig,
+    blocker: Blocker
+) extends TextClassifier[F] {
+
+  def trainClassifier[A](
+      logger: Logger[F],
+      data: Stream[F, Data]
+  )(handler: TextClassifier.Handler[F, A]): F[A] =
+    File
+      .withTempDir(cfg.workingDir, "trainclassifier")
+      .use { dir =>
+        for {
+          rawData   <- writeDataFile(blocker, dir, data)
+          _         <- logger.debug(s"Learning from ${rawData.count} items.")
+          trainData <- splitData(logger, rawData)
+          scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
+          sorted = scores.sortBy(-_.score)
+          res <- handler(sorted.head.model)
+        } yield res
+      }
+
+  def classify(
+      logger: Logger[F],
+      model: ClassifierModel,
+      text: String
+  ): F[Option[String]] =
+    Sync[F].delay {
+      val cls = ColumnDataClassifier.getClassifier(
+        model.model.normalize().toAbsolutePath().toString()
+      )
+      // Training rows are "cls\tref\ttext": pad so the text is column 2 here too.
+      Option(cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text))))
+    }
+
+  // --- helpers
+
+  def train(
+      logger: Logger[F],
+      in: TrainData,
+      props: Map[String, String]
+  ): F[TrainResult] =
+    for {
+      _ <- logger.debug(s"Training classifier from $props")
+      res <- Sync[F].delay {
+        val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props)))
+        cdc.trainClassifier(in.train.toString())
+        val score = cdc.testClassifier(in.test.toString())
+        TrainResult(score.first(), ClassifierModel(in.modelFile))
+      }
+      _ <- logger.debug(s"Trained with result $res")
+    } yield res
+
+  def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
+    val nTest = (in.count * 0.25).toLong
+
+    val td =
+      TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
+
+    val fileLines =
+      fs2.io.file
+        .readAll(in.file, blocker, 4096)
+        .through(fs2.text.utf8Decode)
+        .through(fs2.text.lines)
+
+    for {
+      _ <- logger.debug(
+        s"Splitting raw data into test/train data. Testing with $nTest entries"
+      )
+      _ <-
+        fileLines
+          .take(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.test, blocker))
+          .compile
+          .drain
+      _ <-
+        fileLines
+          .drop(nTest)
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(td.train, blocker))
+          .compile
+          .drain
+    } yield td
+  }
+
+  def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = {
+    val target = dir.resolve("rawdata")
+    for {
+      counter <- Ref.of[F, Long](0L)
+      _ <-
+        data
+          .map(d => s"${d.cls}\t${d.ref}\t${normalisedText(d.text)}")
+          .evalTap(_ => counter.update(_ + 1))
+          .intersperse("\n")
+          .through(fs2.text.utf8Encode)
+          .through(fs2.io.file.writeAll(target, blocker))
+          .compile
+          .drain
+      lines <- counter.get
+    } yield RawData(lines, target)
+
+  }
+
+  def normalisedText(text: String): String = // collapse LF/CR/TAB runs into one space
+    text.replaceAll("[\n\r\t]+", " ")
+
+  def amendProps(
+      trainData: TrainData,
+      props: Map[String, String]
+  ): Map[String, String] =
+    prepend("2.", props) ++ Map( // flags are column-scoped as "N.flag"; text is column 2
+      "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
+      "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
+      "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
+    ).toList
+
+  case class RawData(count: Long, file: Path)
+  case class TrainData(train: Path, test: Path) {
+    val modelFile = train.resolveSibling("model.ser.gz")
+  }
+
+  case class TrainResult(score: Double, model: ClassifierModel)
+
+  def prepend(pre: String, data: Map[String, String]): Map[String, String] =
+    data.toList
+      .map({
+        case (k, v) =>
+          if (k.startsWith(pre)) (k, v)
+          else (pre + k, v)
+      })
+      .toMap
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala
new file mode 100644
index 00000000..f2927d0c
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala
@@ -0,0 +1,25 @@
+package docspell.analysis.nlp
+
+import cats.data.Kleisli
+import fs2.Stream
+
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.common._
+
+trait TextClassifier[F[_]] {
+
+  def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])(
+      handler: TextClassifier.Handler[F, A]
+  ): F[A]
+
+  def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]]
+
+}
+
+object TextClassifier {
+
+  type Handler[F[_], A] = Kleisli[F, ClassifierModel, A]
+
+  case class Data(cls: String, ref: String, text: String)
+
+}
diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala
new file mode 100644
index 00000000..e3baac46
--- /dev/null
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala
@@ -0,0 +1,10 @@
+package docspell.analysis.nlp
+
+import java.nio.file.Path
+
+import cats.data.NonEmptyList
+
+case class TextClassifierConfig(
+    workingDir: Path,
+    classifierConfigs: NonEmptyList[Map[String, String]]
+)
diff --git a/modules/analysis/src/test/resources/test.ser.gz b/modules/analysis/src/test/resources/test.ser.gz
new file mode 100644
index 0000000000000000000000000000000000000000..b6d0956ba0f2100bc670502e77717ac428688a9d
GIT binary patch
literal 1682
zcmV;D25tEtiwFP!000000G(G|h#W-_uG{2t|NlSvAr}RsVnTKaY7`E<+}*m#hRNlU
zI}_OzN#9KGZs%ruCfz-^dl8hNMkVn<(FY{~1q~4s6<<VOB!~ojQ}HDTY6KsQN(8|N
z!LxdPc6RSBMjY7P>H7Jqx~jUm_TsP5ECp<%mY0-@@PZ?(B)8q9X%i_~VKHeiPKlTd
zmW&GTj~ACtzusF(Kywb(F|MfK26PrEQJ$b#ZqcfO5d#5J5nu`dQ=VdW%4d%n&~B4C
zYWu6>@FX<lp<R^(l_kfvq=G~~orjhRWyO-h<(hmxU3CHdylsV_8aBC?$dV7X@A{eS
zOT&MCn}D_Gob8zN)H0xjX&yzS9^)8UAb~IYiHn}JICICSsjc+Z@eC#vGTS6A6Sw3D
zs^IoIw&`%w6M}M;nu#Pv0}uPORO}TRX>PfC`Lp4pU#@{f1~={yD??1>h%B_#_FQ$x
zc7y*^TJft_zf}=!AVwP(f-Gvkswcl1dF={0w+5QBuvW4Y6fJAbwUO4ASS$84S@ZPk
zJsYq5{lfX*63}EIz#=yl+<A-LZJj|e+@h|Pe@EXs<?sSwLMvQ7dwAEI!_WVeK)(ds
zaSKJQ(R;>w`ZD=wDA2fnRSlh1nH-;!N?;%ldeozq)-4H#V>CqJkqeXF*r6X2IzY`>
zEs7%6uK$2_pu)+VP3yt#{qFsbFO{8lv_m?~O;Q(ZGx$I4Vb(fpSxYEr5!9s2RkdQU
z2`?tIcu<N|^j~}9tv^pb_h8`L0<k^qTW>5k<(22CICb&Wk&aisKV7q`bP+E&%%r~I
zjX|pQYp1sA=RTpAoAuX*fYt4P_i24k=tK3;`7?v>nz=0zYyy@&D5ekFl}jB<R&nX!
zNKVrq$Qw6r5|fVYm3f*d5;uyi8JiU4=(C@{eBseS`av8(0r&kMq;p@KJ@xzJzqBVH
zpM`Y?r*!4xA)jMJQY#HTbBuFp9o8Y0Q)DU!9VRhLbZU<vWtoLRQ#ev0n^{CLhi7!F
z$wH6Cq-&E!YK4}eg}Rd24$r|BT4idyfE{jD5XKoVX8bP4PWT8p=nA`D#t~(rvM>~`
z_oHyxE6;eY>j;&G;gEjB?`X5psn0=cf!UOkGR;D75h>$bR;UO($wGHgIG&4U146Cc
z=Ae_wLzLM5W)Y_4m?srRFAdH1PC<)lX!U=~q*{Ar2-sz_GE>1OsLVm1?G$HRo2et1
zhe^ld3ZapP+vO5-Q_ev*E}5z$wAl{2XkyTF&{KPlqTGol{da@%B5n$o8oQ3viJmk_
zFuqJ;&=t7PIQ`QESveR9$c<S{Q<F*jMbpj5*Il(AeVxak&6wC!qR91Pz$>%PJmq2X
z_Cbd%d4&R=cB(@-5U!$3>k7nhN`^tyTvp`hj7iXgAUmkjEgNE&ts~5oh>HQ*R3peE
zOqHf*Cqf^n!0M65MI6~QUI&g%acQDryC><mMO<a<%)y{u*mf;b)7V}yd&A4fvx2#{
zPHP)7CKxAT5mSWkekjAN=ioM^7*XDQ<qX0pgUG;9^m>KFGOH_@$Y@5!)I?xCD2E&j
z;rq0p1y-HG=(1`=*Rltv0<cwG=#ChX#-@m{8eG3u=SoEE^Hw_F<u&@OeVtKQS4BKr
z`rRuBw&UZ4QUeQA%sG-Ca21}PpnmZ7u0(k@pf~73dly|Qu$yq47w%p;0_m$rrh>vY
z{gR<#G!I=t=hDTmO8)edHDIuT7z!&$-VAlN;-p!ba(x(x9fJ=AIE|;=zZdSwLuY-@
z)U<>F1NF2JN3*sWFl5%wWH7=kj*B8X+#d`N`u<7<(L}@Trq-P-5#euXG7sG`1*<dg
z0qr&PBty@IK5M(7dk?YVJ`hp|Tp|=Bwx8!|qI5@8u(57{!FEThsgQ+ZTz|dBA<c9{
zte$19CM0+H<jALxI2iVoaTA9JVlMhxA#Z(r&a*|1eJ3=o**gsAi`k8gy8cjW42=@4
z3g^}g-V%`)(Rr><L&)1uiOPQhBW>aNZ^T-^g5MB~4$VGHHKM<-b^d+b1E1aqr-LPY
z^bJ1t%!wcNyf^s7^64QKiJsk0?OsJFlnZ6`5YmJnOVgVg0mg3BE8u$5u4~sfPwuEg
zYH9>^!^a<Zvim<dde_3wAHKbO;Ask_a28n~&xL;7B*U0xir>uhY@#ax3A|ehdIG_J
c3vL|H?ZIe-e@`Bh+e5s60M;VBa!(Ba0MQ0Ep#T5?

literal 0
HcmV?d00001

diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala
new file mode 100644
index 00000000..b9596923
--- /dev/null
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala
@@ -0,0 +1,76 @@
+package docspell.analysis.nlp
+
+import minitest._
+import cats.effect._
+import scala.concurrent.ExecutionContext
+import java.nio.file.Paths
+import cats.data.NonEmptyList
+import docspell.common._
+import fs2.Stream
+import cats.data.Kleisli
+import TextClassifier.Data
+
+object StanfordTextClassifierSuite extends SimpleTestSuite {
+  val logger = Logger.log4s[IO](org.log4s.getLogger)
+
+  implicit val CS: ContextShift[IO] = IO.contextShift(ExecutionContext.global)
+
+  test("learn from data") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+
+    val data =
+      Stream
+        .emit(Data("invoice", "n", "this is your invoice   total $421"))
+        .repeat
+        .take(10)
+        .zip(
+          Stream
+            .emit(Data("receipt", "n", "shopping receipt cheese cake bar"))
+            .repeat
+            .take(10)
+        )
+        .flatMap({
+          case (a, b) =>
+            Stream.emits(Seq(a, b))
+        })
+        .covary[IO]
+
+    val modelExists =
+      Blocker[IO].use { blocker =>
+        val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+        classifier.trainClassifier[Boolean](logger, data)(
+          Kleisli(result => File.existsNonEmpty[IO](result.model))
+        )
+      }
+    assertEquals(modelExists.unsafeRunSync(), true)
+  }
+
+  test("run classifier") {
+    val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map()))
+    val things = for {
+      dir     <- File.withTempDir[IO](Paths.get("target"), "testcls")
+      blocker <- Blocker[IO]
+    } yield (dir, blocker)
+
+    things
+      .use {
+        case (dir, blocker) =>
+          val classifier = new StanfordTextClassifier[IO](cfg, blocker)
+
+          val modelFile = dir.resolve("test.ser.gz")
+          for {
+            _ <-
+              LenientUri
+                .fromJava(getClass.getResource("/test.ser.gz"))
+                .readURL[IO](4096, blocker)
+                .through(fs2.io.file.writeAll(modelFile, blocker))
+                .compile
+                .drain
+            model = ClassifierModel(modelFile)
+            cat <- classifier.classify(logger, model, "there is receipt always")
+            _ = assertEquals(cat, Some("receipt"))
+          } yield ()
+      }
+      .unsafeRunSync()
+  }
+}
diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index 746f7bac..e09bfd3b 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -298,7 +298,7 @@ docspell.joex {
       # These settings are used to configure the classifier. If
       # multiple are given, they are all tried and the "best" is
       # chosen at the end. See
-      # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
+      # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
       # for more info about these settings. The settings are almost
       # identical to them, as they yielded best results with *my*
       # dataset.
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala
index a90ad61a..cbbb4a33 100644
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -2,7 +2,10 @@ package docspell.joex
 
 import java.nio.file.Path
 
+import cats.data.NonEmptyList
+
 import docspell.analysis.TextAnalysisConfig
+import docspell.analysis.nlp.TextClassifierConfig
 import docspell.backend.Config.Files
 import docspell.common._
 import docspell.convert.ConvertConfig
@@ -62,7 +65,15 @@ object Config {
   ) {
 
     def textAnalysisConfig: TextAnalysisConfig =
-      TextAnalysisConfig(maxLength)
+      TextAnalysisConfig(
+        maxLength,
+        TextClassifierConfig(
+          workingDir,
+          NonEmptyList
+            .fromList(classification.classifiers)
+            .getOrElse(NonEmptyList.of(Map.empty))
+        )
+      )
 
     def regexNerFileConfig: RegexNerFile.Config =
       RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime)
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
new file mode 100644
index 00000000..a161417a
--- /dev/null
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@@ -0,0 +1,64 @@
+package docspell.joex.learn
+
+import cats.data.Kleisli
+import cats.data.OptionT
+import cats.effect._
+import fs2.Stream
+
+import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
+import docspell.analysis.nlp.TextClassifier.Data
+import docspell.backend.ops.OCollective
+import docspell.common._
+import docspell.joex.Config
+import docspell.joex.scheduler._
+
+object LearnClassifierTask {
+
+  type Args = LearnClassifierArgs
+
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
+      blocker: Blocker,
+      analyser: TextAnalyser[F]
+  ): Task[F, Args, Unit] =
+    Task { ctx =>
+      (for {
+        sett <- findActiveSettings[F](ctx.args.collective, cfg)
+        data = selectItems(
+          ctx,
+          math.min(cfg.classification.itemCount, sett.itemCount),
+          sett.category.getOrElse("")
+        )
+        _ <- OptionT.liftF(
+          analyser
+            .classifier(blocker)
+            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx)))
+        )
+      } yield ())
+        .getOrElseF(logInactiveWarning(ctx.logger))
+    }
+
+  private def handleModel[F[_]](
+      ctx: Context[F, Args]
+  )(trainedModel: ClassifierModel): F[Unit] =
+    ???
+
+  private def selectItems[F[_]](
+      ctx: Context[F, Args],
+      max: Int,
+      category: String
+  ): Stream[F, Data] =
+    ???
+
+  private def findActiveSettings[F[_]: Sync](
+      coll: Ident,
+      cfg: Config.TextAnalysis
+  ): OptionT[F, OCollective.Classifier] =
+    ???
+
+  private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
+    logger.warn(
+      "Classification is disabled. Check joex config and the collective settings."
+    )
+}
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index abbb6870..92975a70 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -4,7 +4,7 @@ import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
-import docspell.analysis.nlp.StanfordSettings
+import docspell.analysis.nlp.StanfordNerSettings
 import docspell.common._
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
@@ -42,7 +42,7 @@ object TextAnalysis {
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
-    val settings = StanfordSettings(ctx.args.meta.language, false, None)
+    val settings = StanfordNerSettings(ctx.args.meta.language, false, None)
     for {
       customNer <- nerFile.makeFile(ctx.args.meta.collective)
       sett = settings.copy(regexNer = customNer)

From 68bb65572b361592dabc686a50c2a949252e84a5 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 00:21:19 +0200
Subject: [PATCH 03/10] Integrate learn-classifier task into the app

---
 .../scala/docspell/backend/BackendApp.scala   |  4 +--
 .../docspell/backend/ops/OCollective.scala    | 27 ++++++++++++++++++-
 .../scala/docspell/joex/JoexAppImpl.scala     |  8 ++++++
 .../joex/learn/LearnClassifierTask.scala      | 16 ++++++++---
 .../store/records/RClassifierSetting.scala    |  4 +++
 5 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
index 6ff3c73e..a9572832 100644
--- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
+++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
@@ -52,12 +52,12 @@ object BackendApp {
       queue          <- JobQueue(store)
       loginImpl      <- Login[F](store)
       signupImpl     <- OSignup[F](store)
-      collImpl       <- OCollective[F](store)
+      joexImpl       <- OJoex(JoexClient(httpClient), store)
+      collImpl       <- OCollective[F](store, utStore, joexImpl)
       sourceImpl     <- OSource[F](store)
       tagImpl        <- OTag[F](store)
       equipImpl      <- OEquipment[F](store)
       orgImpl        <- OOrganization(store)
-      joexImpl       <- OJoex(JoexClient(httpClient), store)
       uploadImpl     <- OUpload(store, queue, cfg.files, joexImpl)
       nodeImpl       <- ONode(store)
       jobImpl        <- OJob(store, joexImpl)
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
index 48934016..955a4649 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
@@ -9,8 +9,12 @@ import docspell.backend.ops.OCollective._
 import docspell.common._
 import docspell.store.queries.QCollective
 import docspell.store.records._
+import docspell.store.usertask.UserTask
+import docspell.store.usertask.UserTaskStore
 import docspell.store.{AddResult, Store}
 
+import com.github.eikek.calev.CalEvent
+
 trait OCollective[F[_]] {
 
   def find(name: Ident): F[Option[RCollective]]
@@ -95,7 +99,11 @@ object OCollective {
     }
   }
 
-  def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] =
+  def apply[F[_]: Effect](
+      store: Store[F],
+      uts: UserTaskStore[F],
+      joex: OJoex[F]
+  ): Resource[F, OCollective[F]] =
     Resource.pure[F, OCollective[F]](new OCollective[F] {
       def find(name: Ident): F[Option[RCollective]] =
         store.transact(RCollective.findById(name))
@@ -105,6 +113,23 @@ object OCollective {
           .transact(RCollective.updateSettings(collective, sett))
           .attempt
           .map(AddResult.fromUpdate)
+          .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F])
+
+      def updateLearnClassifierTask(coll: Ident, sett: Settings) =
+        for {
+          id <- Ident.randomId[F]
+          on    = sett.classifier.map(_.enabled).getOrElse(false)
+          timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe(""))
+          ut = UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            on,
+            timer,
+            LearnClassifierArgs(coll)
+          )
+          _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut)
+          _ <- joex.notifyAllNodes
+        } yield ()
 
       def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
         store.transact(RCollective.getSettings(collective))
diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
index 2fa94c25..7c3f57fc 100644
--- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
+++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala
@@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.fts.{MigrationTask, ReIndexTask}
 import docspell.joex.hk._
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.notify._
 import docspell.joex.pdfconv.ConvertAllPdfTask
 import docspell.joex.pdfconv.PdfConvTask
@@ -159,6 +160,13 @@ object JoexAppImpl {
             ConvertAllPdfTask.onCancel[F]
           )
         )
+        .withTask(
+          JobTask.json(
+            LearnClassifierArgs.taskName,
+            LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser),
+            LearnClassifierTask.onCancel[F]
+          )
+        )
         .resource
       psch <- PeriodicScheduler.create(
         cfg.periodicScheduler,
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
index a161417a..6c11fecf 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@@ -12,11 +12,15 @@ import docspell.backend.ops.OCollective
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.scheduler._
+import docspell.store.records.RClassifierSetting
 
 object LearnClassifierTask {
 
   type Args = LearnClassifierArgs
 
+  def onCancel[F[_]: Sync]: Task[F, Args, Unit] =
+    Task.log(_.warn("Cancelling learn-classifier task"))
+
   def apply[F[_]: Sync: ContextShift](
       cfg: Config.TextAnalysis,
       blocker: Blocker,
@@ -24,7 +28,7 @@ object LearnClassifierTask {
   ): Task[F, Args, Unit] =
     Task { ctx =>
       (for {
-        sett <- findActiveSettings[F](ctx.args.collective, cfg)
+        sett <- findActiveSettings[F](ctx, cfg)
         data = selectItems(
           ctx,
           math.min(cfg.classification.itemCount, sett.itemCount),
@@ -52,10 +56,16 @@ object LearnClassifierTask {
     ???
 
   private def findActiveSettings[F[_]: Sync](
-      coll: Ident,
+      ctx: Context[F, Args],
       cfg: Config.TextAnalysis
   ): OptionT[F, OCollective.Classifier] =
-    ???
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
+        .filter(_.enabled)
+        .filter(_.category.nonEmpty)
+        .map(OCollective.Classifier.fromRecord)
+    else
+      OptionT.none
 
   private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] =
     logger.warn(
diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
index 671a8d8f..c15f870c 100644
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@@ -102,5 +102,9 @@ object RClassifierSetting {
         created
       )
   }
+  object Classifier {
+    def fromRecord(r: RClassifierSetting): Classifier =
+      Classifier(r.enabled, r.schedule, r.itemCount, r.category.some)
+  }
 
 }

From 316b490008da457db53cdc34596bdedd49551471 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 07:50:21 +0200
Subject: [PATCH 04/10] Implement learning a text classifier from collective
 data

---
 .../analysis/nlp/StanfordTextClassifier.scala | 18 +++--
 .../joex/learn/LearnClassifierTask.scala      | 52 +++++++++++---
 .../docspell/store/impl/DoobieSyntax.scala    |  4 +-
 .../scala/docspell/store/queries/QItem.scala  | 71 +++++++++++++++++++
 .../store/records/RClassifierSetting.scala    |  3 +
 5 files changed, 130 insertions(+), 18 deletions(-)

diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
index 3da3b5ba..d8846fc4 100644
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala
@@ -26,7 +26,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       .use { dir =>
         for {
           rawData   <- writeDataFile(blocker, dir, data)
-          _         <- logger.debug(s"Learning from ${rawData.count} items.")
+          _         <- logger.info(s"Learning from ${rawData.count} items.")
           trainData <- splitData(logger, rawData)
           scores    <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m))
           sorted = scores.sortBy(-_.score)
@@ -43,7 +43,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       val cls = ColumnDataClassifier.getClassifier(
         model.model.normalize().toAbsolutePath().toString()
       )
-      val cat = cls.classOf(cls.makeDatumFromLine(normalisedText(text)))
+      val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text)))
       Option(cat)
     }
 
@@ -66,7 +66,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
     } yield res
 
   def splitData(logger: Logger[F], in: RawData): F[TrainData] = {
-    val nTest = (in.count * 0.25).toLong
+    val nTest = (in.count * 0.15).toLong
 
     val td =
       TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt"))
@@ -106,9 +106,10 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
       counter <- Ref.of[F, Long](0L)
       _ <-
         data
-          .map(d => s"${d.cls}\t${d.ref}\t${normalisedText(d.text)}")
+          .filter(_.text.nonEmpty)
+          .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}")
           .evalTap(_ => counter.update(_ + 1))
-          .intersperse("\n")
+          .intersperse("\r\n")
           .through(fs2.text.utf8Encode)
           .through(fs2.io.file.writeAll(target, blocker))
           .compile
@@ -119,13 +120,16 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift](
   }
 
   def normalisedText(text: String): String =
-    text.replaceAll("[\n\t]+", " ")
+    text.replaceAll("[\n\r\t]+", " ")
+
+  def fixRef(str: String): String =
+    str.replace('\t', '_')
 
   def amendProps(
       trainData: TrainData,
       props: Map[String, String]
   ): Map[String, String] =
-    prepend("2", props) ++ Map(
+    prepend("2.", props) ++ Map(
       "trainFile"   -> trainData.train.normalize().toAbsolutePath().toString(),
       "testFile"    -> trainData.test.normalize().toAbsolutePath().toString(),
       "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString()
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
index 6c11fecf..013cd215 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@@ -3,7 +3,8 @@ package docspell.joex.learn
 import cats.data.Kleisli
 import cats.data.OptionT
 import cats.effect._
-import fs2.Stream
+import cats.implicits._
+import fs2.{Pipe, Stream}
 
 import docspell.analysis.TextAnalyser
 import docspell.analysis.nlp.ClassifierModel
@@ -12,9 +13,13 @@ import docspell.backend.ops.OCollective
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.scheduler._
+import docspell.store.queries.QItem
 import docspell.store.records.RClassifierSetting
 
+import bitpeace.MimetypeHint
+
 object LearnClassifierTask {
+  val noClass = "__NONE__"
 
   type Args = LearnClassifierArgs
 
@@ -31,29 +36,58 @@ object LearnClassifierTask {
         sett <- findActiveSettings[F](ctx, cfg)
         data = selectItems(
           ctx,
-          math.min(cfg.classification.itemCount, sett.itemCount),
+          math.min(cfg.classification.itemCount, sett.itemCount).toLong,
           sett.category.getOrElse("")
         )
         _ <- OptionT.liftF(
           analyser
             .classifier(blocker)
-            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx)))
+            .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker)))
         )
       } yield ())
         .getOrElseF(logInactiveWarning(ctx.logger))
     }
 
-  private def handleModel[F[_]](
-      ctx: Context[F, Args]
+  private def handleModel[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      blocker: Blocker
   )(trainedModel: ClassifierModel): F[Unit] =
-    ???
+    for {
+      oldFile <- ctx.store.transact(
+        RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId))
+      )
+      _ <- ctx.logger.info("Storing new trained model")
+      fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096)
+      newFile <-
+        ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError
+      _ <- ctx.store.transact(
+        RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id))
+      )
+      _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}")
+      _ <- oldFile match {
+        case Some(fid) =>
+          ctx.logger.debug(s"Deleting old model file ${fid.id}") *>
+            ctx.store.bitpeace.delete(fid.id).compile.drain
+        case None => ().pure[F]
+      }
+    } yield ()
 
   private def selectItems[F[_]](
       ctx: Context[F, Args],
-      max: Int,
+      max: Long,
       category: String
-  ): Stream[F, Data] =
-    ???
+  ): Stream[F, Data] = {
+    val connStream =
+      for {
+        item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
+        tt   <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
+      } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
+    ctx.store.transact(connStream.filter(_.text.nonEmpty))
+  }
+
+  private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] =
+    if (max <= 0) identity
+    else _.take(max)
 
   private def findActiveSettings[F[_]: Sync](
       ctx: Context[F, Args],
diff --git a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala
index e4a67538..3a992b71 100644
--- a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala
+++ b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala
@@ -67,8 +67,8 @@ trait DoobieSyntax {
       Fragment.const(" FROM ") ++ table ++ this.where(where)
 
   def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
-    Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
-      Fragment.const(") FROM ") ++ table ++ this.where(where)
+    Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
+      Fragment.const(" FROM ") ++ table ++ this.where(where)
 
   def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
     Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this
diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
index 1240d4a7..312523ce 100644
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
 import cats.implicits._
 import fs2.Stream
 
+import docspell.common.syntax.all._
 import docspell.common.{IdRef, _}
 import docspell.store.Store
 import docspell.store.impl.Implicits._
@@ -615,4 +616,74 @@ object QItem {
       .query[NameAndNotes]
       .streamWithChunkSize(chunkSize)
   }
+
+  def findAllNewesFirst(
+      collective: Ident,
+      chunkSize: Int
+  ): Stream[ConnectionIO, Ident] = {
+    val cols = Seq(RItem.Columns.id)
+    (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
+      orderBy(RItem.Columns.created.desc))
+      .query[Ident]
+      .streamWithChunkSize(chunkSize)
+  }
+
+  case class TagName(id: Ident, name: String)
+  case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
+
+  def resolveTextAndTag(
+      collective: Ident,
+      itemId: Ident,
+      tagCategory: String
+  ): ConnectionIO[TextAndTag] = {
+    val aId    = RAttachment.Columns.id.prefix("a")
+    val aItem  = RAttachment.Columns.itemId.prefix("a")
+    val mId    = RAttachmentMeta.Columns.id.prefix("m")
+    val mText  = RAttachmentMeta.Columns.content.prefix("m")
+    val tiItem = RTagItem.Columns.itemId.prefix("ti")
+    val tiTag  = RTagItem.Columns.tagId.prefix("ti")
+    val tId    = RTag.Columns.tid.prefix("t")
+    val tName  = RTag.Columns.name.prefix("t")
+    val tCat   = RTag.Columns.category.prefix("t")
+    val iId    = RItem.Columns.id.prefix("i")
+    val iColl  = RItem.Columns.cid.prefix("i")
+
+    val cte = withCTE(
+      "tags" -> selectSimple(
+        Seq(tiItem, tId, tName),
+        RTagItem.table ++ fr"ti INNER JOIN" ++
+          RTag.table ++ fr"t ON" ++ tId.is(tiTag),
+        and(tiItem.is(itemId), tCat.is(tagCategory))
+      )
+    )
+
+    val cols = Seq(mText, tId, tName)
+
+    val from = RItem.table ++ fr"i INNER JOIN" ++
+      RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
+      RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
+      fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)
+
+    val where =
+      and(
+        iId.is(itemId),
+        iColl.is(collective),
+        mText.isNotNull,
+        mText.isNot("")
+      )
+
+    val q = cte ++ selectDistinct(cols, from, where)
+    for {
+      _ <- logger.ftrace[ConnectionIO](
+        s"query: $q  (${itemId.id}, ${collective.id}, ${tagCategory})"
+      )
+      texts <- q.query[(String, Option[TagName])].to[List]
+      _ <- logger.ftrace[ConnectionIO](
+        s"Got ${texts.size} text and tag entries for item ${itemId.id}"
+      )
+      tag = texts.headOption.flatMap(_._2)
+      txt = texts.map(_._1).mkString(" --n-- ")
+    } yield TextAndTag(itemId, txt, tag)
+  }
+
 }
diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
index c15f870c..680741a0 100644
--- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
+++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala
@@ -61,6 +61,9 @@ object RClassifierSetting {
     sql.update.run
   }
 
+  def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
+    updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run
+
   def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
     for {
       n1 <- updateRow(

From 237b96062553439cea4c38d3aa7b4a3518aeee1d Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 21:51:57 +0200
Subject: [PATCH 05/10] Guess a tag on item processing using a trained model if
 available

---
 .../docspell/joex/process/ItemData.scala      |  3 +
 .../docspell/joex/process/ProcessItem.scala   |  4 +-
 .../docspell/joex/process/TextAnalysis.scala  | 59 +++++++++++++++++--
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
index d4f83fc2..af9a3db2 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala
@@ -38,6 +38,9 @@ case class ItemData(
     copy(metas = next)
   }
 
+  def appendTags(tags: Seq[String]): ItemData =
+    copy(tags = (this.tags ++ tags.toList).distinct)
+
   def changeMeta(
       attachId: Ident,
       f: RAttachmentMeta => RAttachmentMeta
diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
index 7b8b6431..fb777b24 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala
@@ -34,12 +34,12 @@ object ProcessItem {
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item)
 
-  def analysisOnly[F[_]: Sync](
+  def analysisOnly[F[_]: Sync: ContextShift](
       cfg: Config,
       analyser: TextAnalyser[F],
       regexNer: RegexNerFile[F]
   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](analyser, regexNer)(item)
+    TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item)
       .flatMap(FindProposal[F](cfg.processing))
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index 92975a70..039f52e7 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -1,23 +1,32 @@
 package docspell.joex.process
 
+import cats.data.OptionT
 import cats.effect._
 import cats.implicits._
 
 import docspell.analysis.TextAnalyser
+import docspell.analysis.nlp.ClassifierModel
 import docspell.analysis.nlp.StanfordNerSettings
+import docspell.analysis.nlp.TextClassifier
 import docspell.common._
+import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
+import docspell.store.records.RClassifierSetting
+
+import bitpeace.RangeDef
 
 object TextAnalysis {
+  type Args = ProcessItemArgs
 
-  def apply[F[_]: Sync](
+  def apply[F[_]: Sync: ContextShift](
+      cfg: Config.TextAnalysis,
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
-  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  )(item: ItemData): Task[F, Args, ItemData] =
     Task { ctx =>
       for {
         _ <- ctx.logger.info("Starting text analysis")
@@ -34,11 +43,14 @@ object TextAnalysis {
         e <- s
         _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
         v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value
+      } yield item
+        .copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+        .appendTags(tag.toSeq)
     }
 
   def annotateAttachment[F[_]: Sync](
-      ctx: Context[F, ProcessItemArgs],
+      ctx: Context[F, Args],
       analyser: TextAnalyser[F],
       nerFile: RegexNerFile[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = {
@@ -54,4 +66,43 @@ object TextAnalysis {
       )
     } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
   }
+
+  def predictTag[F[_]: Sync: ContextShift](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis,
+      metas: Vector[RAttachmentMeta],
+      classifier: TextClassifier[F]
+  ): OptionT[F, String] =
+    for {
+      model <- findActiveModel(ctx, cfg)
+      _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
+      text = metas.flatMap(_.content).mkString("   ------   ")
+      modelData =
+        ctx.store.bitpeace
+          .get(model.id)
+          .unNoneTerminate
+          .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+      cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir =>
+        val modelFile = dir.resolve("model.ser.gz")
+        modelData
+          .through(fs2.io.file.writeAll(modelFile, ctx.blocker))
+          .compile
+          .drain
+          .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
+
+      })
+      _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
+    } yield cls
+
+  private def findActiveModel[F[_]: Sync](
+      ctx: Context[F, Args],
+      cfg: Config.TextAnalysis
+  ): OptionT[F, Ident] =
+    if (cfg.classification.enabled)
+      OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective)))
+        .filter(_.enabled)
+        .mapFilter(_.fileId)
+    else
+      OptionT.none
+
 }

From 8677eca6d4af715259b1d6897b8a587b4f089782 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 21:59:31 +0200
Subject: [PATCH 06/10] Fix setting default in dropdown

---
 .../webapp/src/main/elm/Comp/ClassifierSettingsForm.elm  | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
index ef6a7638..23e440cd 100644
--- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
+++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm
@@ -54,7 +54,7 @@ init flags sett =
     in
     ( { enabled = sett.enabled
       , categoryModel = Comp.FixedDropdown.initString []
-      , category = Nothing
+      , category = sett.category
       , scheduleModel = cem
       , schedule = Data.Validated.Unknown newSchedule
       , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
@@ -92,7 +92,12 @@ update flags msg model =
             in
             ( { model
                 | categoryModel = Comp.FixedDropdown.initString categories
-                , category = List.head categories
+                , category =
+                    if model.category == Nothing then
+                        List.head categories
+
+                    else
+                        model.category
               }
             , Cmd.none
             )

From f9fcee81a5f141c51b2d93cadd01e98b996b8d65 Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 23:56:57 +0200
Subject: [PATCH 07/10] Add start-now button for train-classifier task

---
 .../scala/docspell/backend/BackendApp.scala   |  2 +-
 .../docspell/backend/ops/OCollective.scala    | 20 +++++-
 .../src/main/resources/docspell-openapi.yml   | 22 +++++++
 .../restserver/routes/CollectiveRoutes.scala  |  6 ++
 modules/webapp/src/main/elm/Api.elm           | 14 ++++
 .../main/elm/Comp/CollectiveSettingsForm.elm  | 66 +++++++++++++++----
 6 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
index a9572832..be76d45b 100644
--- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
+++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala
@@ -53,7 +53,7 @@ object BackendApp {
       loginImpl      <- Login[F](store)
       signupImpl     <- OSignup[F](store)
       joexImpl       <- OJoex(JoexClient(httpClient), store)
-      collImpl       <- OCollective[F](store, utStore, joexImpl)
+      collImpl       <- OCollective[F](store, utStore, queue, joexImpl)
       sourceImpl     <- OSource[F](store)
       tagImpl        <- OTag[F](store)
       equipImpl      <- OEquipment[F](store)
diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
index 955a4649..5e9b5aaf 100644
--- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
+++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala
@@ -8,12 +8,13 @@ import docspell.backend.PasswordCrypt
 import docspell.backend.ops.OCollective._
 import docspell.common._
 import docspell.store.queries.QCollective
+import docspell.store.queue.JobQueue
 import docspell.store.records._
 import docspell.store.usertask.UserTask
 import docspell.store.usertask.UserTaskStore
 import docspell.store.{AddResult, Store}
 
-import com.github.eikek.calev.CalEvent
+import com.github.eikek.calev._
 
 trait OCollective[F[_]] {
 
@@ -49,6 +50,7 @@ trait OCollective[F[_]] {
 
   def findEnabledSource(sourceId: Ident): F[Option[RSource]]
 
+  def startLearnClassifier(collective: Ident): F[Unit]
 }
 
 object OCollective {
@@ -102,6 +104,7 @@ object OCollective {
   def apply[F[_]: Effect](
       store: Store[F],
       uts: UserTaskStore[F],
+      queue: JobQueue[F],
       joex: OJoex[F]
   ): Resource[F, OCollective[F]] =
     Resource.pure[F, OCollective[F]](new OCollective[F] {
@@ -131,6 +134,21 @@ object OCollective {
           _ <- joex.notifyAllNodes
         } yield ()
 
+      def startLearnClassifier(collective: Ident): F[Unit] =
+        for {
+          id <- Ident.randomId[F]
+          ut <- UserTask(
+            id,
+            LearnClassifierArgs.taskName,
+            true,
+            CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All),
+            LearnClassifierArgs(collective)
+          ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName))
+          job <- ut.toJob
+          _   <- queue.insert(job)
+          _   <- joex.notifyAllNodes
+        } yield ()
+
       def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
         store.transact(RCollective.getSettings(collective))
 
diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml
index 1a20db8d..a03a0e2e 100644
--- a/modules/restapi/src/main/resources/docspell-openapi.yml
+++ b/modules/restapi/src/main/resources/docspell-openapi.yml
@@ -1047,6 +1047,28 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/ContactList"
+
+  /sec/collective/classifier/startonce:
+    post:
+      tags: [ Collective ]
+      summary: Starts the learn-classifier task
+      description: |
+        If the collective has classification enabled, this will submit
+        the task for learning a classifier from existing data. This
+        task is usually run periodically as determined by the
+        collective settings.
+
+        The request is empty, settings are used from the collective.
+      security:
+        - authTokenHeader: []
+      responses:
+        200:
+          description: Ok
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BasicResult"
+
   /sec/user:
     get:
       tags: [ Collective ]
diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
index 2aed289f..bf7eaddd 100644
--- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
+++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala
@@ -88,6 +88,12 @@ object CollectiveRoutes {
           resp <- Ok(ContactList(res.map(Conversions.mkContact)))
         } yield resp
 
+      case POST -> Root / "classifier" / "startonce" =>
+        for {
+          _    <- backend.collective.startLearnClassifier(user.account.collective)
+          resp <- Ok(BasicResult(true, "Task submitted"))
+        } yield resp
+
       case GET -> Root =>
         for {
           collDb <- backend.collective.find(user.account.collective)
diff --git a/modules/webapp/src/main/elm/Api.elm b/modules/webapp/src/main/elm/Api.elm
index 10bcf7ff..ccba8570 100644
--- a/modules/webapp/src/main/elm/Api.elm
+++ b/modules/webapp/src/main/elm/Api.elm
@@ -88,6 +88,7 @@ module Api exposing
     , setItemNotes
     , setTags
     , setUnconfirmed
+    , startClassifier
     , startOnceNotifyDueItems
     , startOnceScanMailbox
     , startReIndex
@@ -795,6 +796,19 @@ versionInfo flags receive =
 --- Collective
 
 
+startClassifier :
+    Flags
+    -> (Result Http.Error BasicResult -> msg)
+    -> Cmd msg
+startClassifier flags receive =
+    Http2.authPost
+        { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce"
+        , account = getAccount flags
+        , body = Http.emptyBody
+        , expect = Http.expectJson receive Api.Model.BasicResult.decoder
+        }
+
+
 getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg
 getTagCloud flags receive =
     Http2.authGet
diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
index 87696d85..1efef12d 100644
--- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
+++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm
@@ -30,6 +30,7 @@ type alias Model =
     , fullTextConfirmText : String
     , fullTextReIndexResult : Maybe BasicResult
     , classifierModel : Comp.ClassifierSettingsForm.Model
+    , startClassifierResult : Maybe BasicResult
     }
 
 
@@ -60,6 +61,7 @@ init flags settings =
       , fullTextConfirmText = ""
       , fullTextReIndexResult = Nothing
       , classifierModel = cm
+      , startClassifierResult = Nothing
       }
     , Cmd.map ClassifierSettingMsg cc
     )
@@ -91,6 +93,8 @@ type Msg
     | TriggerReIndexResult (Result Http.Error BasicResult)
     | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
     | SaveSettings
+    | StartClassifierTask
+    | StartClassifierResp (Result Http.Error BasicResult)
 
 
 update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@@ -169,12 +173,30 @@ update flags msg model =
                 _ ->
                     ( model, Cmd.none, Nothing )
 
+        StartClassifierTask ->
+            ( model, Api.startClassifier flags StartClassifierResp, Nothing )
+
+        StartClassifierResp (Ok br) ->
+            ( { model | startClassifierResult = Just br }
+            , Cmd.none
+            , Nothing
+            )
+
+        StartClassifierResp (Err err) ->
+            ( { model
+                | startClassifierResult =
+                    Just (BasicResult False (Util.Http.errorToString err))
+              }
+            , Cmd.none
+            , Nothing
+            )
+
 
 view : Flags -> UiSettings -> Model -> Html Msg
 view flags settings model =
     div
         [ classList
-            [ ( "ui form", True )
+            [ ( "ui form error success", True )
             , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
             , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
             ]
@@ -250,18 +272,7 @@ view flags settings model =
                 [ text "This starts a task that clears the full-text index and re-indexes all your data again."
                 , text "You must type OK before clicking the button to avoid accidental re-indexing."
                 ]
-            , div
-                [ classList
-                    [ ( "ui message", True )
-                    , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False )
-                    , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True )
-                    , ( "hidden invisible", model.fullTextReIndexResult == Nothing )
-                    ]
-                ]
-                [ Maybe.map .message model.fullTextReIndexResult
-                    |> Maybe.withDefault ""
-                    |> text
-                ]
+            , renderResultMessage model.fullTextReIndexResult
             ]
         , h3
             [ classList
@@ -279,6 +290,19 @@ view flags settings model =
             ]
             [ Html.map ClassifierSettingMsg
                 (Comp.ClassifierSettingsForm.view model.classifierModel)
+            , div [ class "ui vertical segment" ]
+                [ button
+                    [ classList
+                        [ ( "ui small secondary basic button", True )
+                        , ( "disabled", not model.classifierModel.enabled )
+                        ]
+                    , title "Starts a task to train a classifier"
+                    , onClick StartClassifierTask
+                    ]
+                    [ text "Start now"
+                    ]
+                , renderResultMessage model.startClassifierResult
+                ]
             ]
         , div [ class "ui divider" ] []
         , button
@@ -291,3 +315,19 @@ view flags settings model =
             [ text "Save"
             ]
         ]
+
+
+renderResultMessage : Maybe BasicResult -> Html msg
+renderResultMessage result =
+    div
+        [ classList
+            [ ( "ui message", True )
+            , ( "error", Maybe.map .success result == Just False )
+            , ( "success", Maybe.map .success result == Just True )
+            , ( "hidden invisible", result == Nothing )
+            ]
+        ]
+        [ Maybe.map .message result
+            |> Maybe.withDefault ""
+            |> text
+        ]

From 4309bd8dfd6976697dbbe74ec600d53140c767ec Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Tue, 1 Sep 2020 23:57:27 +0200
Subject: [PATCH 08/10] Some cleanup

---
 modules/joex/src/main/resources/reference.conf              | 6 +++---
 .../scala/docspell/joex/learn/LearnClassifierTask.scala     | 5 ++++-
 .../src/main/scala/docspell/joex/process/TextAnalysis.scala | 6 +++---
 .../store/src/main/scala/docspell/store/queries/QItem.scala | 5 +++--
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf
index e09bfd3b..23ec5b47 100644
--- a/modules/joex/src/main/resources/reference.conf
+++ b/modules/joex/src/main/resources/reference.conf
@@ -299,9 +299,8 @@ docspell.joex {
       # multiple are given, they are all tried and the "best" is
       # chosen at the end. See
       # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
-      # for more info about these settings. The settings are almost
-      # identical to them, as they yielded best results with *my*
-      # dataset.
+      # for more info about these settings. The settings here yielded
+      # good results with *my* dataset.
       #
       # Enclose regexps in triple quotes.
       classifiers = [
@@ -312,6 +311,7 @@ docspell.joex {
           "maxNGramLeng" = "4"
           "minNGramLeng" = "1"
           "splitWordShape" = "chris4"
+          "intern" = "true" # makes it slower but saves memory
         }
       ]
     }
diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
index 013cd215..c3d6e3f9 100644
--- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
+++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala
@@ -20,6 +20,7 @@ import bitpeace.MimetypeHint
 
 object LearnClassifierTask {
   val noClass = "__NONE__"
+  val pageSep = " --n-- "
 
   type Args = LearnClassifierArgs
 
@@ -80,7 +81,9 @@ object LearnClassifierTask {
     val connStream =
       for {
         item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max))
-        tt   <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category))
+        tt <- Stream.eval(
+          QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep)
+        )
       } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim)
     ctx.store.transact(connStream.filter(_.text.nonEmpty))
   }
diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
index 039f52e7..ebb0894a 100644
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier
 import docspell.common._
 import docspell.joex.Config
 import docspell.joex.analysis.RegexNerFile
+import docspell.joex.learn.LearnClassifierTask
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Context
 import docspell.joex.scheduler.Task
@@ -76,7 +77,7 @@ object TextAnalysis {
     for {
       model <- findActiveModel(ctx, cfg)
       _     <- OptionT.liftF(ctx.logger.info(s"Guessing tag …"))
-      text = metas.flatMap(_.content).mkString("   ------   ")
+      text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
       modelData =
         ctx.store.bitpeace
           .get(model.id)
@@ -89,8 +90,7 @@ object TextAnalysis {
           .compile
           .drain
           .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text))
-
-      })
+      }).filter(_ != LearnClassifierTask.noClass)
       _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}"))
     } yield cls
 
diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
index 312523ce..d3d2653e 100644
--- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala
+++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala
@@ -634,7 +634,8 @@ object QItem {
   def resolveTextAndTag(
       collective: Ident,
       itemId: Ident,
-      tagCategory: String
+      tagCategory: String,
+      pageSep: String
   ): ConnectionIO[TextAndTag] = {
     val aId    = RAttachment.Columns.id.prefix("a")
     val aItem  = RAttachment.Columns.itemId.prefix("a")
@@ -682,7 +683,7 @@ object QItem {
         s"Got ${texts.size} text and tag entries for item ${itemId.id}"
       )
       tag = texts.headOption.flatMap(_._2)
-      txt = texts.map(_._1).mkString(" --n-- ")
+      txt = texts.map(_._1).mkString(pageSep)
     } yield TextAndTag(itemId, txt, tag)
   }
 

From 145c3084614f199269c64b92a6dcaf1f79d950ec Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Wed, 2 Sep 2020 00:18:55 +0200
Subject: [PATCH 09/10] Update documentation

---
 website/elm/Feature.elm                      |  2 +-
 website/site/content/docs/webapp/metadata.md | 25 ++++++++++++++++----
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/website/elm/Feature.elm b/website/elm/Feature.elm
index 246aa7ad..4d2fb734 100644
--- a/website/elm/Feature.elm
+++ b/website/elm/Feature.elm
@@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u
     , { image = "img/analyze-feature.png"
       , header = "Text Analysis"
       , description = """
-The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically.
+The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically.
 """
       }
     , { image = "img/filetype-feature.svg"
diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md
index 36e5d57c..0f5e23b2 100644
--- a/website/site/content/docs/webapp/metadata.md
+++ b/website/site/content/docs/webapp/metadata.md
@@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like
 "assignment" semantics. Docspell doesn't propose any workflow, but it
 can help to implement some.
 
-The tags are *not* taken into account when creating suggestions from
-analyzed text yet. However, PDF files may contain metadata itself and
-if there is a metadata *keywords* list, these keywords are matched
-against the tags in the database. If they match, the item is tagged
-automatically.
+Docspell can try to predict a tag for new incoming documents
+automatically based on your existing data. This requires training an
+algorithm. There are some caveats: the more data you have correctly
+tagged, the better the results are. So it won't work well for maybe
+the first 100 documents. Then the tags must somehow relate to a
+pattern in the document text. Tags like *todo* or *waiting* probably
+won't work, obviously. But the typical "document type" tag, like
+*invoice* and *receipt* is a good fit! That is why you need to provide
+a tag category so only sensible tags are being learned. The algorithm
+goes through all your items and learns patterns in the text that
+relate to the given tags. This training step can be run periodically,
+as specified in your collective settings such that docspell keeps
+learning from your already tagged data! More information about the
+algorithm can be found in the config, where it is possible to
+fine-tune this process.
+
+Another way to have items tagged automatically is when an input PDF
+file contains a list of keywords in its metadata section (this only
+applies to PDF files). These keywords are then matched against the
+tags in the database. If they match, the item is tagged with them.
 
 
 ## Organization and Person

From afbe9554b6b14c60d6e4395ca18397c05a21b15c Mon Sep 17 00:00:00 2001
From: Eike Kettner <eike.kettner@posteo.de>
Date: Wed, 2 Sep 2020 22:23:08 +0200
Subject: [PATCH 10/10] Update joex nixos module

---
 nix/module-joex.nix | 68 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/nix/module-joex.nix b/nix/module-joex.nix
index d550c2d3..7619711f 100644
--- a/nix/module-joex.nix
+++ b/nix/module-joex.nix
@@ -95,6 +95,21 @@ let
         enabled = true;
         file-cache-time = "1 minute";
       };
+      classification = {
+        enabled = true;
+        item-count = 0;
+        classifiers = [
+          { "useSplitWords" = "true";
+            "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.'';
+            "splitWordsIgnoreRegexp" = ''\s+'';
+            "useSplitPrefixSuffixNGrams" = "true";
+            "maxNGramLeng" = "4";
+            "minNGramLeng" = "1";
+            "splitWordShape" = "chris4";
+            "intern" = "true";
+          }
+        ];
+      };
       working-dir = "/tmp/docspell-analysis";
     };
     processing = {
@@ -736,6 +751,59 @@ in {
               default = defaults.text-analysis.regex-ner;
               description = "";
             };
+
+            classification = mkOption {
+              type = types.submodule({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.text-analysis.classification.enabled;
+                    description = ''
+                      Whether to enable classification globally. Each collective can
+                      decide to disable it. If it is disabled here, no collective
+                      can use classification.
+                    '';
+                  };
+                  item-count = mkOption {
+                    type = types.int;
+                    default = defaults.text-analysis.classification.item-count;
+                    description = ''
+                      If concerned with memory consumption, this restricts the
+                      number of items to consider. More are better for training. A
+                      negative value or zero means to train on all items.
+                    '';
+                  };
+                  classifiers = mkOption {
+                    type = types.listOf types.attrs;
+                    default = defaults.text-analysis.classification.classifiers;
+                    description = ''
+                      These settings are used to configure the classifier. If
+                      multiple are given, they are all tried and the "best" is
+                      chosen at the end. See
+                      https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html
+                      for more info about these settings. The settings here yielded
+                      good results with *my* dataset.
+                    '';
+                  };
+
+                };
+              });
+              default = defaults.text-analysis.classification;
+              description = ''
+                Settings for doing document classification.
+
+                This works by learning from existing documents. A collective can
+                specify a tag category and the system will try to predict a tag
+                from this category for new incoming documents.
+
+                This requires a statistical model that is computed from all
+                existing documents. This process is run periodically as
+                configured by the collective. It may require a lot of memory,
+                depending on the amount of data.
+
+                It utilises this NLP library: https://nlp.stanford.edu/.
+              '';
+            };
           };
         });
         default = defaults.text-analysis;