Add classifier settings

This commit is contained in:
Eike Kettner 2020-08-28 22:17:49 +02:00
parent 53fdb100ab
commit 8c4f2e702b
17 changed files with 649 additions and 56 deletions

View File

@ -15,7 +15,9 @@ trait OCollective[F[_]] {
def find(name: Ident): F[Option[RCollective]]
def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult]
def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult]
def findSettings(collective: Ident): F[Option[OCollective.Settings]]
def listUser(collective: Ident): F[Vector[RUser]]
@ -55,6 +57,8 @@ object OCollective {
type Settings = RCollective.Settings
val Settings = RCollective.Settings
type Classifier = RClassifierSetting.Classifier
val Classifier = RClassifierSetting.Classifier
sealed trait PassChangeResult
object PassChangeResult {
@ -102,6 +106,9 @@ object OCollective {
.attempt
.map(AddResult.fromUpdate)
def findSettings(collective: Ident): F[Option[OCollective.Settings]] =
store.transact(RCollective.getSettings(collective))
def listUser(collective: Ident): F[Vector[RUser]] =
store.transact(RUser.findAll(collective, _.login))

View File

@ -0,0 +1,35 @@
package docspell.common
import docspell.common.syntax.all._
import io.circe._
import io.circe.generic.semiauto._
/** Arguments to the classify-item task.
*
* This task is run periodically and learns from existing documents
* to create a model for predicting tags of new documents. The user
* must give a tag category as a subset of possible tags..
*/
case class LearnClassifierArgs(
collective: Ident
) {
def makeSubject: String =
"Learn tags"
}
object LearnClassifierArgs {
val taskName = Ident.unsafe("learn-classifier")
implicit val jsonEncoder: Encoder[LearnClassifierArgs] =
deriveEncoder[LearnClassifierArgs]
implicit val jsonDecoder: Decoder[LearnClassifierArgs] =
deriveDecoder[LearnClassifierArgs]
def parse(str: String): Either[Throwable, LearnClassifierArgs] =
str.parseJsonAs[LearnClassifierArgs]
}

View File

@ -271,6 +271,50 @@ docspell.joex {
# file will be kept until a check for a state change is done.
file-cache-time = "1 minute"
}
# Settings for doing document classification.
#
# This works by learning from existing documents. A collective can
# specify a tag category and the system will try to predict a tag
# from this category for new incoming documents.
#
# This requires a satstical model that is computed from all
# existing documents. This process is run periodically as
# configured by the collective. It may require a lot of memory,
# depending on the amount of data.
#
# It utilises this NLP library: https://nlp.stanford.edu/.
classification {
# Whether to enable classification globally. Each collective can
# decide to disable it. If it is disabled here, no collective
# can use classification.
enabled = true
# If concerned with memory consumption, this restricts the
# number of items to consider. More are better for training. A
# negative value or zero means no train on all items.
item-count = 0
# These settings are used to configure the classifier. If
# multiple are given, they are all tried and the "best" is
# chosen at the end. See
# https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups
# for more info about these settings. The settings are almost
# identical to them, as they yielded best results with *my*
# dataset.
#
# Enclose regexps in triple quotes.
classifiers = [
{ "useSplitWords" = "true"
"splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|."""
"splitWordsIgnoreRegexp" = """\s+"""
"useSplitPrefixSuffixNGrams" = "true"
"maxNGramLeng" = "4"
"minNGramLeng" = "1"
"splitWordShape" = "chris4"
}
]
}
}
# Configuration for converting files into PDFs.

View File

@ -57,7 +57,8 @@ object Config {
case class TextAnalysis(
maxLength: Int,
workingDir: Path,
regexNer: RegexNer
regexNer: RegexNer,
classification: Classification
) {
def textAnalysisConfig: TextAnalysisConfig =
@ -68,4 +69,10 @@ object Config {
}
case class RegexNer(enabled: Boolean, fileCacheTime: Duration)
case class Classification(
enabled: Boolean,
itemCount: Int,
classifiers: List[Map[String, String]]
)
}

View File

@ -3643,12 +3643,14 @@ components:
description: DateTime
type: integer
format: date-time
CollectiveSettings:
description: |
Settings for a collective.
required:
- language
- integrationEnabled
- classifier
properties:
language:
type: string
@ -3658,6 +3660,31 @@ components:
description: |
Whether the collective has the integration endpoint
enabled.
classifier:
$ref: "#/components/schemas/ClassifierSetting"
ClassifierSetting:
description: |
Settings for learning a document classifier.
required:
- enabled
- schedule
- itemCount
properties:
enabled:
type: boolean
category:
type: string
itemCount:
type: integer
format: int32
description: |
The max. number of items to learn from. The newest items
are considered.
schedule:
type: string
format: calevent
SourceList:
description: |
A list of sources.

View File

@ -10,6 +10,7 @@ import docspell.restapi.model._
import docspell.restserver.conv.Conversions
import docspell.restserver.http4s._
import com.github.eikek.calev.CalEvent
import org.http4s.HttpRoutes
import org.http4s.circe.CirceEntityDecoder._
import org.http4s.circe.CirceEntityEncoder._
@ -37,7 +38,18 @@ object CollectiveRoutes {
case req @ POST -> Root / "settings" =>
for {
settings <- req.as[CollectiveSettings]
sett = OCollective.Settings(settings.language, settings.integrationEnabled)
sett = OCollective.Settings(
settings.language,
settings.integrationEnabled,
Some(
OCollective.Classifier(
settings.classifier.enabled,
settings.classifier.schedule,
settings.classifier.itemCount,
settings.classifier.category
)
)
)
res <-
backend.collective
.updateSettings(user.account.collective, sett)
@ -46,8 +58,21 @@ object CollectiveRoutes {
case GET -> Root / "settings" =>
for {
collDb <- backend.collective.find(user.account.collective)
sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled))
settDb <- backend.collective.findSettings(user.account.collective)
sett = settDb.map(c =>
CollectiveSettings(
c.language,
c.integrationEnabled,
ClassifierSetting(
c.classifier.map(_.enabled).getOrElse(false),
c.classifier.flatMap(_.category),
c.classifier.map(_.itemCount).getOrElse(0),
c.classifier
.map(_.schedule)
.getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00"))
)
)
)
resp <- sett.toResponse()
} yield resp

View File

@ -0,0 +1,9 @@
CREATE TABLE `classifier_setting` (
`cid` varchar(254) not null primary key,
`enabled` boolean not null,
`schedule` varchar(254) not null,
`category` varchar(254) not null,
`file_id` varchar(254),
`created` timestamp not null,
foreign key (`cid`) references `collective`(`cid`)
);

View File

@ -0,0 +1,11 @@
CREATE TABLE "classifier_setting" (
"cid" varchar(254) not null primary key,
"enabled" boolean not null,
"schedule" varchar(254) not null,
"category" varchar(254) not null,
"item_count" int not null,
"file_id" varchar(254),
"created" timestamp not null,
foreign key ("cid") references "collective"("cid"),
foreign key ("file_id") references "filemeta"("id")
);

View File

@ -0,0 +1,106 @@
package docspell.store.records
import cats.implicits._
import docspell.common._
import docspell.store.impl.Implicits._
import docspell.store.impl._
import com.github.eikek.calev._
import doobie._
import doobie.implicits._
case class RClassifierSetting(
cid: Ident,
enabled: Boolean,
schedule: CalEvent,
category: String,
itemCount: Int,
fileId: Option[Ident],
created: Timestamp
) {}
object RClassifierSetting {
val table = fr"classifier_setting"
object Columns {
val cid = Column("cid")
val enabled = Column("enabled")
val schedule = Column("schedule")
val category = Column("category")
val itemCount = Column("item_count")
val fileId = Column("file_id")
val created = Column("created")
val all = List(cid, enabled, schedule, category, itemCount, fileId, created)
}
import Columns._
def insert(v: RClassifierSetting): ConnectionIO[Int] = {
val sql =
insertRow(
table,
all,
fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}"
)
sql.update.run
}
def updateAll(v: RClassifierSetting): ConnectionIO[Int] = {
val sql = updateRow(
table,
cid.is(v.cid),
commas(
enabled.setTo(v.enabled),
schedule.setTo(v.schedule),
category.setTo(v.category),
itemCount.setTo(v.itemCount),
fileId.setTo(v.fileId)
)
)
sql.update.run
}
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
for {
n1 <- updateRow(
table,
cid.is(v.cid),
commas(
enabled.setTo(v.enabled),
schedule.setTo(v.schedule),
itemCount.setTo(v.itemCount),
category.setTo(v.category)
)
).update.run
n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO]
} yield n1 + n2
def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = {
val sql = selectSimple(all, table, cid.is(id))
sql.query[RClassifierSetting].option
}
def delete(coll: Ident): ConnectionIO[Int] =
deleteFrom(table, cid.is(coll)).update.run
case class Classifier(
enabled: Boolean,
schedule: CalEvent,
itemCount: Int,
category: Option[String]
) {
def toRecord(coll: Ident, created: Timestamp): RClassifierSetting =
RClassifierSetting(
coll,
enabled,
schedule,
category.getOrElse(""),
itemCount,
None,
created
)
}
}

View File

@ -61,14 +61,47 @@ object RCollective {
updateRow(table, id.is(cid), language.setTo(lang)).update.run
def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] =
updateRow(
table,
id.is(cid),
commas(
language.setTo(settings.language),
integration.setTo(settings.integrationEnabled)
)
).update.run
for {
n1 <- updateRow(
table,
id.is(cid),
commas(
language.setTo(settings.language),
integration.setTo(settings.integrationEnabled)
)
).update.run
cls <-
Timestamp
.current[ConnectionIO]
.map(now => settings.classifier.map(_.toRecord(cid, now)))
n2 <- cls match {
case Some(cr) =>
RClassifierSetting.updateSettings(cr)
case None =>
RClassifierSetting.delete(cid)
}
} yield n1 + n2
def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = {
val cId = id.prefix("c")
val CS = RClassifierSetting.Columns
val csCid = CS.cid.prefix("cs")
val cols = Seq(
language.prefix("c"),
integration.prefix("c"),
CS.enabled.prefix("cs"),
CS.schedule.prefix("cs"),
CS.itemCount.prefix("cs"),
CS.category.prefix("cs")
)
val from = table ++ fr"c LEFT JOIN" ++
RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId)
selectSimple(cols, from, cId.is(coll))
.query[Settings]
.option
}
def findById(cid: Ident): ConnectionIO[Option[RCollective]] = {
val sql = selectSimple(all, table, id.is(cid))
@ -112,5 +145,10 @@ object RCollective {
selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option
}
case class Settings(language: Language, integrationEnabled: Boolean)
case class Settings(
language: Language,
integrationEnabled: Boolean,
classifier: Option[RClassifierSetting.Classifier]
)
}

View File

@ -218,12 +218,12 @@ loginInfo model =
, menuEntry model
CollectiveSettingPage
[ i [ class "users circle icon" ] []
, text "Collective Settings"
, text "Collective Profile"
]
, menuEntry model
UserSettingPage
[ i [ class "user circle icon" ] []
, text "User Settings"
, text "User Profile"
]
, div [ class "divider" ] []
, menuEntry model

View File

@ -0,0 +1,199 @@
module Comp.ClassifierSettingsForm exposing
( Model
, Msg
, getSettings
, init
, update
, view
)
import Api
import Api.Model.ClassifierSetting exposing (ClassifierSetting)
import Api.Model.TagList exposing (TagList)
import Comp.CalEventInput
import Comp.FixedDropdown
import Comp.IntField
import Data.CalEvent exposing (CalEvent)
import Data.Flags exposing (Flags)
import Data.Validated exposing (Validated(..))
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck)
import Http
import Util.Tag
type alias Model =
{ enabled : Bool
, categoryModel : Comp.FixedDropdown.Model String
, category : Maybe String
, scheduleModel : Comp.CalEventInput.Model
, schedule : Validated CalEvent
, itemCountModel : Comp.IntField.Model
, itemCount : Maybe Int
}
type Msg
= GetTagsResp (Result Http.Error TagList)
| ScheduleMsg Comp.CalEventInput.Msg
| ToggleEnabled
| CategoryMsg (Comp.FixedDropdown.Msg String)
| ItemCountMsg Comp.IntField.Msg
init : Flags -> ClassifierSetting -> ( Model, Cmd Msg )
init flags sett =
let
newSchedule =
Data.CalEvent.fromEvent sett.schedule
|> Maybe.withDefault Data.CalEvent.everyMonth
( cem, cec ) =
Comp.CalEventInput.init flags newSchedule
in
( { enabled = sett.enabled
, categoryModel = Comp.FixedDropdown.initString []
, category = Nothing
, scheduleModel = cem
, schedule = Data.Validated.Unknown newSchedule
, itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count"
, itemCount = Just sett.itemCount
}
, Cmd.batch
[ Api.getTags flags "" GetTagsResp
, Cmd.map ScheduleMsg cec
]
)
getSettings : Model -> Validated ClassifierSetting
getSettings model =
Data.Validated.map
(\sch ->
{ enabled = model.enabled
, category = model.category
, schedule =
Data.CalEvent.makeEvent sch
, itemCount = Maybe.withDefault 0 model.itemCount
}
)
model.schedule
update : Flags -> Msg -> Model -> ( Model, Cmd Msg )
update flags msg model =
case msg of
GetTagsResp (Ok tl) ->
let
categories =
Util.Tag.getCategories tl.items
|> List.sort
in
( { model
| categoryModel = Comp.FixedDropdown.initString categories
, category = List.head categories
}
, Cmd.none
)
GetTagsResp (Err _) ->
( model, Cmd.none )
ScheduleMsg lmsg ->
let
( cm, cc, ce ) =
Comp.CalEventInput.update
flags
(Data.Validated.value model.schedule)
lmsg
model.scheduleModel
in
( { model
| scheduleModel = cm
, schedule = ce
}
, Cmd.map ScheduleMsg cc
)
ToggleEnabled ->
( { model | enabled = not model.enabled }
, Cmd.none
)
CategoryMsg lmsg ->
let
( mm, ma ) =
Comp.FixedDropdown.update lmsg model.categoryModel
in
( { model
| categoryModel = mm
, category =
if ma == Nothing then
model.category
else
ma
}
, Cmd.none
)
ItemCountMsg lmsg ->
let
( im, iv ) =
Comp.IntField.update lmsg model.itemCountModel
in
( { model
| itemCountModel = im
, itemCount = iv
}
, Cmd.none
)
view : Model -> Html Msg
view model =
div []
[ div
[ class "field"
]
[ div [ class "ui checkbox" ]
[ input
[ type_ "checkbox"
, onCheck (\_ -> ToggleEnabled)
, checked model.enabled
]
[]
, label [] [ text "Enable classification" ]
, span [ class "small-info" ]
[ text "Disable document classification if not needed."
]
]
]
, div [ class "ui basic segment" ]
[ text "Document classification tries to predict a tag for new incoming documents. This "
, text "works by learning from existing documents in order to find common patterns within "
, text "the text. The more documents you have correctly tagged, the better. Learning is done "
, text "periodically based on a schedule and you need to specify a tag-group that should "
, text "be used for learning."
]
, div [ class "field" ]
[ label [] [ text "Category" ]
, Html.map CategoryMsg
(Comp.FixedDropdown.viewString model.category
model.categoryModel
)
]
, Html.map ItemCountMsg
(Comp.IntField.viewWithInfo
"The maximum number of items to learn from, order by date newest first. Use 0 to mean all."
model.itemCount
"field"
model.itemCountModel
)
, div [ class "field" ]
[ label [] [ text "Schedule" ]
, Html.map ScheduleMsg
(Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel)
]
]

View File

@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing
import Api
import Api.Model.BasicResult exposing (BasicResult)
import Api.Model.CollectiveSettings exposing (CollectiveSettings)
import Comp.ClassifierSettingsForm
import Comp.Dropdown
import Data.Flags exposing (Flags)
import Data.Language exposing (Language)
import Data.UiSettings exposing (UiSettings)
import Data.Validated exposing (Validated)
import Html exposing (..)
import Html.Attributes exposing (..)
import Html.Events exposing (onCheck, onClick, onInput)
@ -27,44 +29,58 @@ type alias Model =
, initSettings : CollectiveSettings
, fullTextConfirmText : String
, fullTextReIndexResult : Maybe BasicResult
, classifierModel : Comp.ClassifierSettingsForm.Model
}
init : CollectiveSettings -> Model
init settings =
init : Flags -> CollectiveSettings -> ( Model, Cmd Msg )
init flags settings =
let
lang =
Data.Language.fromString settings.language
|> Maybe.withDefault Data.Language.German
( cm, cc ) =
Comp.ClassifierSettingsForm.init flags settings.classifier
in
{ langModel =
Comp.Dropdown.makeSingleList
{ makeOption =
\l ->
{ value = Data.Language.toIso3 l
, text = Data.Language.toName l
, additional = ""
}
, placeholder = ""
, options = Data.Language.all
, selected = Just lang
}
, intEnabled = settings.integrationEnabled
, initSettings = settings
, fullTextConfirmText = ""
, fullTextReIndexResult = Nothing
}
( { langModel =
Comp.Dropdown.makeSingleList
{ makeOption =
\l ->
{ value = Data.Language.toIso3 l
, text = Data.Language.toName l
, additional = ""
}
, placeholder = ""
, options = Data.Language.all
, selected = Just lang
}
, intEnabled = settings.integrationEnabled
, initSettings = settings
, fullTextConfirmText = ""
, fullTextReIndexResult = Nothing
, classifierModel = cm
}
, Cmd.map ClassifierSettingMsg cc
)
getSettings : Model -> CollectiveSettings
getSettings : Model -> Validated CollectiveSettings
getSettings model =
CollectiveSettings
(Comp.Dropdown.getSelected model.langModel
|> List.head
|> Maybe.map Data.Language.toIso3
|> Maybe.withDefault model.initSettings.language
Data.Validated.map
(\cls ->
{ language =
Comp.Dropdown.getSelected model.langModel
|> List.head
|> Maybe.map Data.Language.toIso3
|> Maybe.withDefault model.initSettings.language
, integrationEnabled = model.intEnabled
, classifier = cls
}
)
(Comp.ClassifierSettingsForm.getSettings
model.classifierModel
)
model.intEnabled
type Msg
@ -73,6 +89,8 @@ type Msg
| SetFullTextConfirm String
| TriggerReIndex
| TriggerReIndexResult (Result Http.Error BasicResult)
| ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg
| SaveSettings
update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings )
@ -85,22 +103,15 @@ update flags msg model =
nextModel =
{ model | langModel = m2 }
nextSettings =
if Comp.Dropdown.isDropdownChangeMsg m then
Just (getSettings nextModel)
else
Nothing
in
( nextModel, Cmd.map LangDropdownMsg c2, nextSettings )
( nextModel, Cmd.map LangDropdownMsg c2, Nothing )
ToggleIntegrationEndpoint ->
let
nextModel =
{ model | intEnabled = not model.intEnabled }
in
( nextModel, Cmd.none, Just (getSettings nextModel) )
( nextModel, Cmd.none, Nothing )
SetFullTextConfirm str ->
( { model | fullTextConfirmText = str }, Cmd.none, Nothing )
@ -138,6 +149,26 @@ update flags msg model =
, Nothing
)
ClassifierSettingMsg lmsg ->
let
( cm, cc ) =
Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel
in
( { model
| classifierModel = cm
}
, Cmd.map ClassifierSettingMsg cc
, Nothing
)
SaveSettings ->
case getSettings model of
Data.Validated.Valid s ->
( model, Cmd.none, Just s )
_ ->
( model, Cmd.none, Nothing )
view : Flags -> UiSettings -> Model -> Html Msg
view flags settings model =
@ -232,4 +263,31 @@ view flags settings model =
|> text
]
]
, h3
[ classList
[ ( "ui dividing header", True )
, ( "invisible hidden", False )
]
]
[ text "Document Classifier"
]
, div
[ classList
[ ( "field", True )
, ( "invisible hidden", False )
]
]
[ Html.map ClassifierSettingMsg
(Comp.ClassifierSettingsForm.view model.classifierModel)
]
, div [ class "ui divider" ] []
, button
[ classList
[ ( "ui primary button", True )
, ( "disabled", getSettings model |> Data.Validated.isInvalid )
]
, onClick SaveSettings
]
[ text "Save"
]
]

View File

@ -1,5 +1,6 @@
module Data.Validated exposing
( Validated(..)
, isInvalid
, map
, map2
, map3
@ -14,6 +15,19 @@ type Validated a
| Unknown a
isInvalid : Validated a -> Bool
isInvalid v =
case v of
Valid _ ->
False
Invalid _ _ ->
True
Unknown _ ->
False
value : Validated a -> a
value va =
case va of

View File

@ -30,15 +30,21 @@ init flags =
let
( sm, sc ) =
Comp.SourceManage.init flags
( cm, cc ) =
Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty
in
( { currentTab = Just InsightsTab
, sourceModel = sm
, userModel = Comp.UserManage.emptyModel
, settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty
, settingsModel = cm
, insights = Api.Model.ItemInsights.empty
, submitResult = Nothing
}
, Cmd.map SourceMsg sc
, Cmd.batch
[ Cmd.map SourceMsg sc
, Cmd.map SettingsFormMsg cc
]
)

View File

@ -77,7 +77,13 @@ update flags msg model =
( model, Cmd.none )
CollectiveSettingsResp (Ok data) ->
( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none )
let
( cm, cc ) =
Comp.CollectiveSettingsForm.init flags data
in
( { model | settingsModel = cm }
, Cmd.map SettingsFormMsg cc
)
CollectiveSettingsResp (Err _) ->
( model, Cmd.none )

View File

@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg)
viewSettings flags settings model =
[ h2 [ class "ui header" ]
[ i [ class "cog icon" ] []
, text "Settings"
, text "Collective Settings"
]
, div [ class "ui segment" ]
[ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
[ Html.map SettingsFormMsg
(Comp.CollectiveSettingsForm.view flags settings model.settingsModel)
]
, div
[ classList