From 8c4f2e702ba0b0ffbfe654a2e53bae4ed9bc90cc Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 28 Aug 2020 22:17:49 +0200 Subject: [PATCH 01/10] Add classifier settings --- .../docspell/backend/ops/OCollective.scala | 9 +- .../docspell/common/LearnClassifierArgs.scala | 35 +++ .../joex/src/main/resources/reference.conf | 44 ++++ .../src/main/scala/docspell/joex/Config.scala | 9 +- .../src/main/resources/docspell-openapi.yml | 27 +++ .../restserver/routes/CollectiveRoutes.scala | 31 ++- .../migration/mariadb/V1.9.1__classifier.sql | 9 + .../postgresql/V1.9.1__classifier.sql | 11 + .../store/records/RClassifierSetting.scala | 106 ++++++++++ .../docspell/store/records/RCollective.scala | 56 ++++- modules/webapp/src/main/elm/App/View.elm | 4 +- .../main/elm/Comp/ClassifierSettingsForm.elm | 199 ++++++++++++++++++ .../main/elm/Comp/CollectiveSettingsForm.elm | 128 ++++++++--- .../webapp/src/main/elm/Data/Validated.elm | 14 ++ .../main/elm/Page/CollectiveSettings/Data.elm | 10 +- .../elm/Page/CollectiveSettings/Update.elm | 8 +- .../main/elm/Page/CollectiveSettings/View.elm | 5 +- 17 files changed, 649 insertions(+), 56 deletions(-) create mode 100644 modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql create mode 100644 modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala create mode 100644 modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala index e3835448..48934016 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala @@ -15,7 +15,9 @@ trait OCollective[F[_]] { def 
find(name: Ident): F[Option[RCollective]] - def updateSettings(collective: Ident, lang: OCollective.Settings): F[AddResult] + def updateSettings(collective: Ident, settings: OCollective.Settings): F[AddResult] + + def findSettings(collective: Ident): F[Option[OCollective.Settings]] def listUser(collective: Ident): F[Vector[RUser]] @@ -55,6 +57,8 @@ object OCollective { type Settings = RCollective.Settings val Settings = RCollective.Settings + type Classifier = RClassifierSetting.Classifier + val Classifier = RClassifierSetting.Classifier sealed trait PassChangeResult object PassChangeResult { @@ -102,6 +106,9 @@ object OCollective { .attempt .map(AddResult.fromUpdate) + def findSettings(collective: Ident): F[Option[OCollective.Settings]] = + store.transact(RCollective.getSettings(collective)) + def listUser(collective: Ident): F[Vector[RUser]] = store.transact(RUser.findAll(collective, _.login)) diff --git a/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala new file mode 100644 index 00000000..9cfa9395 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/LearnClassifierArgs.scala @@ -0,0 +1,35 @@ +package docspell.common + +import docspell.common.syntax.all._ + +import io.circe._ +import io.circe.generic.semiauto._ + +/** Arguments to the learn-classifier task. + * + * This task is run periodically and learns from existing documents + * to create a model for predicting tags of new documents. The user + * must give a tag category as a subset of possible tags. 
+ */ +case class LearnClassifierArgs( + collective: Ident +) { + + def makeSubject: String = + "Learn tags" + +} + +object LearnClassifierArgs { + + val taskName = Ident.unsafe("learn-classifier") + + implicit val jsonEncoder: Encoder[LearnClassifierArgs] = + deriveEncoder[LearnClassifierArgs] + implicit val jsonDecoder: Decoder[LearnClassifierArgs] = + deriveDecoder[LearnClassifierArgs] + + def parse(str: String): Either[Throwable, LearnClassifierArgs] = + str.parseJsonAs[LearnClassifierArgs] + +} diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 115d2893..746f7bac 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -271,6 +271,50 @@ docspell.joex { # file will be kept until a check for a state change is done. file-cache-time = "1 minute" } + + # Settings for doing document classification. + # + # This works by learning from existing documents. A collective can + # specify a tag category and the system will try to predict a tag + # from this category for new incoming documents. + # + # This requires a statistical model that is computed from all + # existing documents. This process is run periodically as + # configured by the collective. It may require a lot of memory, + # depending on the amount of data. + # + # It utilises this NLP library: https://nlp.stanford.edu/. + classification { + # Whether to enable classification globally. Each collective can + # decide to disable it. If it is disabled here, no collective + # can use classification. + enabled = true + + # If concerned with memory consumption, this restricts the + # number of items to consider. More are better for training. A + # negative value or zero means to train on all items. + item-count = 0 + + # These settings are used to configure the classifier. If + # multiple are given, they are all tried and the "best" is + # chosen at the end. 
See + # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups + # for more info about these settings. The settings are almost + # identical to them, as they yielded best results with *my* + # dataset. + # + # Enclose regexps in triple quotes. + classifiers = [ + { "useSplitWords" = "true" + "splitWordsTokenizerRegexp" = """[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.""" + "splitWordsIgnoreRegexp" = """\s+""" + "useSplitPrefixSuffixNGrams" = "true" + "maxNGramLeng" = "4" + "minNGramLeng" = "1" + "splitWordShape" = "chris4" + } + ] + } } # Configuration for converting files into PDFs. diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index cb6bb9f3..a90ad61a 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -57,7 +57,8 @@ object Config { case class TextAnalysis( maxLength: Int, workingDir: Path, - regexNer: RegexNer + regexNer: RegexNer, + classification: Classification ) { def textAnalysisConfig: TextAnalysisConfig = @@ -68,4 +69,10 @@ object Config { } case class RegexNer(enabled: Boolean, fileCacheTime: Duration) + + case class Classification( + enabled: Boolean, + itemCount: Int, + classifiers: List[Map[String, String]] + ) } diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 1a48eece..1a20db8d 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -3643,12 +3643,14 @@ components: description: DateTime type: integer format: date-time + CollectiveSettings: description: | Settings for a collective. required: - language - integrationEnabled + - classifier properties: language: type: string @@ -3658,6 +3660,31 @@ components: description: | Whether the collective has the integration endpoint enabled. 
+ classifier: + $ref: "#/components/schemas/ClassifierSetting" + + ClassifierSetting: + description: | + Settings for learning a document classifier. + required: + - enabled + - schedule + - itemCount + properties: + enabled: + type: boolean + category: + type: string + itemCount: + type: integer + format: int32 + description: | + The max. number of items to learn from. The newest items + are considered. + schedule: + type: string + format: calevent + SourceList: description: | A list of sources. diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index 8a84fa77..2aed289f 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -10,6 +10,7 @@ import docspell.restapi.model._ import docspell.restserver.conv.Conversions import docspell.restserver.http4s._ +import com.github.eikek.calev.CalEvent import org.http4s.HttpRoutes import org.http4s.circe.CirceEntityDecoder._ import org.http4s.circe.CirceEntityEncoder._ @@ -37,7 +38,18 @@ object CollectiveRoutes { case req @ POST -> Root / "settings" => for { settings <- req.as[CollectiveSettings] - sett = OCollective.Settings(settings.language, settings.integrationEnabled) + sett = OCollective.Settings( + settings.language, + settings.integrationEnabled, + Some( + OCollective.Classifier( + settings.classifier.enabled, + settings.classifier.schedule, + settings.classifier.itemCount, + settings.classifier.category + ) + ) + ) res <- backend.collective .updateSettings(user.account.collective, sett) @@ -46,8 +58,21 @@ object CollectiveRoutes { case GET -> Root / "settings" => for { - collDb <- backend.collective.find(user.account.collective) - sett = collDb.map(c => CollectiveSettings(c.language, c.integrationEnabled)) + settDb <- 
backend.collective.findSettings(user.account.collective) + sett = settDb.map(c => + CollectiveSettings( + c.language, + c.integrationEnabled, + ClassifierSetting( + c.classifier.map(_.enabled).getOrElse(false), + c.classifier.flatMap(_.category), + c.classifier.map(_.itemCount).getOrElse(0), + c.classifier + .map(_.schedule) + .getOrElse(CalEvent.unsafe("*-1/3-01 01:00:00")) + ) + ) + ) resp <- sett.toResponse() } yield resp diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql new file mode 100644 index 00000000..fb1e85cd --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.9.1__classifier.sql @@ -0,0 +1,11 @@ +-- Classifier settings, one row per collective (includes item_count like the PostgreSQL table). +CREATE TABLE `classifier_setting` ( + `cid` varchar(254) not null primary key, + `enabled` boolean not null, + `schedule` varchar(254) not null, + `category` varchar(254) not null, + `item_count` int not null, + `file_id` varchar(254), + `created` timestamp not null, + foreign key (`cid`) references `collective`(`cid`) +); diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql new file mode 100644 index 00000000..5e81feea --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.9.1__classifier.sql @@ -0,0 +1,11 @@ +CREATE TABLE "classifier_setting" ( + "cid" varchar(254) not null primary key, + "enabled" boolean not null, + "schedule" varchar(254) not null, + "category" varchar(254) not null, + "item_count" int not null, + "file_id" varchar(254), + "created" timestamp not null, + foreign key ("cid") references "collective"("cid"), + foreign key ("file_id") references "filemeta"("id") +); diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala new file mode 100644 index 00000000..671a8d8f --- 
/dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -0,0 +1,106 @@ +package docspell.store.records + +import cats.implicits._ + +import docspell.common._ +import docspell.store.impl.Implicits._ +import docspell.store.impl._ + +import com.github.eikek.calev._ +import doobie._ +import doobie.implicits._ + +case class RClassifierSetting( + cid: Ident, + enabled: Boolean, + schedule: CalEvent, + category: String, + itemCount: Int, + fileId: Option[Ident], + created: Timestamp +) {} + +object RClassifierSetting { + + val table = fr"classifier_setting" + + object Columns { + val cid = Column("cid") + val enabled = Column("enabled") + val schedule = Column("schedule") + val category = Column("category") + val itemCount = Column("item_count") + val fileId = Column("file_id") + val created = Column("created") + val all = List(cid, enabled, schedule, category, itemCount, fileId, created) + } + import Columns._ + + def insert(v: RClassifierSetting): ConnectionIO[Int] = { + val sql = + insertRow( + table, + all, + fr"${v.cid},${v.enabled},${v.schedule},${v.category},${v.itemCount},${v.fileId},${v.created}" + ) + sql.update.run + } + + def updateAll(v: RClassifierSetting): ConnectionIO[Int] = { + val sql = updateRow( + table, + cid.is(v.cid), + commas( + enabled.setTo(v.enabled), + schedule.setTo(v.schedule), + category.setTo(v.category), + itemCount.setTo(v.itemCount), + fileId.setTo(v.fileId) + ) + ) + sql.update.run + } + + def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = + for { + n1 <- updateRow( + table, + cid.is(v.cid), + commas( + enabled.setTo(v.enabled), + schedule.setTo(v.schedule), + itemCount.setTo(v.itemCount), + category.setTo(v.category) + ) + ).update.run + n2 <- if (n1 <= 0) insert(v) else 0.pure[ConnectionIO] + } yield n1 + n2 + + def findById(id: Ident): ConnectionIO[Option[RClassifierSetting]] = { + val sql = selectSimple(all, table, cid.is(id)) + sql.query[RClassifierSetting].option + } + + def 
delete(coll: Ident): ConnectionIO[Int] = + deleteFrom(table, cid.is(coll)).update.run + + case class Classifier( + enabled: Boolean, + schedule: CalEvent, + itemCount: Int, + category: Option[String] + ) { + + def toRecord(coll: Ident, created: Timestamp): RClassifierSetting = + RClassifierSetting( + coll, + enabled, + schedule, + category.getOrElse(""), + itemCount, + None, + created + ) + } + +} diff --git a/modules/store/src/main/scala/docspell/store/records/RCollective.scala b/modules/store/src/main/scala/docspell/store/records/RCollective.scala index fa40e374..2487ed22 100644 --- a/modules/store/src/main/scala/docspell/store/records/RCollective.scala +++ b/modules/store/src/main/scala/docspell/store/records/RCollective.scala @@ -61,14 +61,47 @@ object RCollective { updateRow(table, id.is(cid), language.setTo(lang)).update.run def updateSettings(cid: Ident, settings: Settings): ConnectionIO[Int] = - updateRow( - table, - id.is(cid), - commas( - language.setTo(settings.language), - integration.setTo(settings.integrationEnabled) - ) - ).update.run + for { + n1 <- updateRow( + table, + id.is(cid), + commas( + language.setTo(settings.language), + integration.setTo(settings.integrationEnabled) + ) + ).update.run + cls <- + Timestamp + .current[ConnectionIO] + .map(now => settings.classifier.map(_.toRecord(cid, now))) + n2 <- cls match { + case Some(cr) => + RClassifierSetting.updateSettings(cr) + case None => + RClassifierSetting.delete(cid) + } + } yield n1 + n2 + + def getSettings(coll: Ident): ConnectionIO[Option[Settings]] = { + val cId = id.prefix("c") + val CS = RClassifierSetting.Columns + val csCid = CS.cid.prefix("cs") + + val cols = Seq( + language.prefix("c"), + integration.prefix("c"), + CS.enabled.prefix("cs"), + CS.schedule.prefix("cs"), + CS.itemCount.prefix("cs"), + CS.category.prefix("cs") + ) + val from = table ++ fr"c LEFT JOIN" ++ + RClassifierSetting.table ++ fr"cs ON" ++ csCid.is(cId) + + selectSimple(cols, from, cId.is(coll)) + 
.query[Settings] + .option + } def findById(cid: Ident): ConnectionIO[Option[RCollective]] = { val sql = selectSimple(all, table, id.is(cid)) @@ -112,5 +145,10 @@ object RCollective { selectSimple(all.map(_.prefix("c")), from, aId.is(attachId)).query[RCollective].option } - case class Settings(language: Language, integrationEnabled: Boolean) + case class Settings( + language: Language, + integrationEnabled: Boolean, + classifier: Option[RClassifierSetting.Classifier] + ) + } diff --git a/modules/webapp/src/main/elm/App/View.elm b/modules/webapp/src/main/elm/App/View.elm index 6906fd2f..346983e6 100644 --- a/modules/webapp/src/main/elm/App/View.elm +++ b/modules/webapp/src/main/elm/App/View.elm @@ -218,12 +218,12 @@ loginInfo model = , menuEntry model CollectiveSettingPage [ i [ class "users circle icon" ] [] - , text "Collective Settings" + , text "Collective Profile" ] , menuEntry model UserSettingPage [ i [ class "user circle icon" ] [] - , text "User Settings" + , text "User Profile" ] , div [ class "divider" ] [] , menuEntry model diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm new file mode 100644 index 00000000..ef6a7638 --- /dev/null +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -0,0 +1,199 @@ +module Comp.ClassifierSettingsForm exposing + ( Model + , Msg + , getSettings + , init + , update + , view + ) + +import Api +import Api.Model.ClassifierSetting exposing (ClassifierSetting) +import Api.Model.TagList exposing (TagList) +import Comp.CalEventInput +import Comp.FixedDropdown +import Comp.IntField +import Data.CalEvent exposing (CalEvent) +import Data.Flags exposing (Flags) +import Data.Validated exposing (Validated(..)) +import Html exposing (..) +import Html.Attributes exposing (..) 
+import Html.Events exposing (onCheck) +import Http +import Util.Tag + + +type alias Model = + { enabled : Bool + , categoryModel : Comp.FixedDropdown.Model String + , category : Maybe String + , scheduleModel : Comp.CalEventInput.Model + , schedule : Validated CalEvent + , itemCountModel : Comp.IntField.Model + , itemCount : Maybe Int + } + + +type Msg + = GetTagsResp (Result Http.Error TagList) + | ScheduleMsg Comp.CalEventInput.Msg + | ToggleEnabled + | CategoryMsg (Comp.FixedDropdown.Msg String) + | ItemCountMsg Comp.IntField.Msg + + +init : Flags -> ClassifierSetting -> ( Model, Cmd Msg ) +init flags sett = + let + newSchedule = + Data.CalEvent.fromEvent sett.schedule + |> Maybe.withDefault Data.CalEvent.everyMonth + + ( cem, cec ) = + Comp.CalEventInput.init flags newSchedule + in + ( { enabled = sett.enabled + , categoryModel = Comp.FixedDropdown.initString [] + , category = sett.category -- start from the category given in the settings + , scheduleModel = cem + , schedule = Data.Validated.Unknown newSchedule + , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" + , itemCount = Just sett.itemCount + } + , Cmd.batch + [ Api.getTags flags "" GetTagsResp + , Cmd.map ScheduleMsg cec + ] + ) + + +getSettings : Model -> Validated ClassifierSetting +getSettings model = + Data.Validated.map + (\sch -> + { enabled = model.enabled + , category = model.category + , schedule = + Data.CalEvent.makeEvent sch + , itemCount = Maybe.withDefault 0 model.itemCount + } + ) + model.schedule + + +update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) +update flags msg model = + case msg of + GetTagsResp (Ok tl) -> + let + categories = + Util.Tag.getCategories tl.items + |> List.sort + in + ( { model + | categoryModel = Comp.FixedDropdown.initString categories + , category = Maybe.withDefault (List.head categories) (Maybe.map Just model.category) -- default to the first category only when none is set + } + , Cmd.none + ) + + GetTagsResp (Err _) -> + ( model, Cmd.none ) + + ScheduleMsg lmsg -> + let + ( cm, cc, ce ) = + Comp.CalEventInput.update + flags + (Data.Validated.value model.schedule) + lmsg + model.scheduleModel 
+ in + ( { model + | scheduleModel = cm + , schedule = ce + } + , Cmd.map ScheduleMsg cc + ) + + ToggleEnabled -> + ( { model | enabled = not model.enabled } + , Cmd.none + ) + + CategoryMsg lmsg -> + let + ( mm, ma ) = + Comp.FixedDropdown.update lmsg model.categoryModel + in + ( { model + | categoryModel = mm + , category = + if ma == Nothing then + model.category + + else + ma + } + , Cmd.none + ) + + ItemCountMsg lmsg -> + let + ( im, iv ) = + Comp.IntField.update lmsg model.itemCountModel + in + ( { model + | itemCountModel = im + , itemCount = iv + } + , Cmd.none + ) + + +view : Model -> Html Msg +view model = + div [] + [ div + [ class "field" + ] + [ div [ class "ui checkbox" ] + [ input + [ type_ "checkbox" + , onCheck (\_ -> ToggleEnabled) + , checked model.enabled + ] + [] + , label [] [ text "Enable classification" ] + , span [ class "small-info" ] + [ text "Disable document classification if not needed." + ] + ] + ] + , div [ class "ui basic segment" ] + [ text "Document classification tries to predict a tag for new incoming documents. This " + , text "works by learning from existing documents in order to find common patterns within " + , text "the text. The more documents you have correctly tagged, the better. Learning is done " + , text "periodically based on a schedule and you need to specify a tag-group that should " + , text "be used for learning." + ] + , div [ class "field" ] + [ label [] [ text "Category" ] + , Html.map CategoryMsg + (Comp.FixedDropdown.viewString model.category + model.categoryModel + ) + ] + , Html.map ItemCountMsg + (Comp.IntField.viewWithInfo + "The maximum number of items to learn from, order by date newest first. Use 0 to mean all." 
+ model.itemCount + "field" + model.itemCountModel + ) + , div [ class "field" ] + [ label [] [ text "Schedule" ] + , Html.map ScheduleMsg + (Comp.CalEventInput.view "" (Data.Validated.value model.schedule) model.scheduleModel) + ] + ] diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm index 342473c1..87696d85 100644 --- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm @@ -10,10 +10,12 @@ module Comp.CollectiveSettingsForm exposing import Api import Api.Model.BasicResult exposing (BasicResult) import Api.Model.CollectiveSettings exposing (CollectiveSettings) +import Comp.ClassifierSettingsForm import Comp.Dropdown import Data.Flags exposing (Flags) import Data.Language exposing (Language) import Data.UiSettings exposing (UiSettings) +import Data.Validated exposing (Validated) import Html exposing (..) import Html.Attributes exposing (..) 
import Html.Events exposing (onCheck, onClick, onInput) @@ -27,44 +29,58 @@ type alias Model = , initSettings : CollectiveSettings , fullTextConfirmText : String , fullTextReIndexResult : Maybe BasicResult + , classifierModel : Comp.ClassifierSettingsForm.Model } -init : CollectiveSettings -> Model -init settings = +init : Flags -> CollectiveSettings -> ( Model, Cmd Msg ) +init flags settings = let lang = Data.Language.fromString settings.language |> Maybe.withDefault Data.Language.German + + ( cm, cc ) = + Comp.ClassifierSettingsForm.init flags settings.classifier in - { langModel = - Comp.Dropdown.makeSingleList - { makeOption = - \l -> - { value = Data.Language.toIso3 l - , text = Data.Language.toName l - , additional = "" - } - , placeholder = "" - , options = Data.Language.all - , selected = Just lang - } - , intEnabled = settings.integrationEnabled - , initSettings = settings - , fullTextConfirmText = "" - , fullTextReIndexResult = Nothing - } + ( { langModel = + Comp.Dropdown.makeSingleList + { makeOption = + \l -> + { value = Data.Language.toIso3 l + , text = Data.Language.toName l + , additional = "" + } + , placeholder = "" + , options = Data.Language.all + , selected = Just lang + } + , intEnabled = settings.integrationEnabled + , initSettings = settings + , fullTextConfirmText = "" + , fullTextReIndexResult = Nothing + , classifierModel = cm + } + , Cmd.map ClassifierSettingMsg cc + ) -getSettings : Model -> CollectiveSettings +getSettings : Model -> Validated CollectiveSettings getSettings model = - CollectiveSettings - (Comp.Dropdown.getSelected model.langModel - |> List.head - |> Maybe.map Data.Language.toIso3 - |> Maybe.withDefault model.initSettings.language + Data.Validated.map + (\cls -> + { language = + Comp.Dropdown.getSelected model.langModel + |> List.head + |> Maybe.map Data.Language.toIso3 + |> Maybe.withDefault model.initSettings.language + , integrationEnabled = model.intEnabled + , classifier = cls + } + ) + 
(Comp.ClassifierSettingsForm.getSettings + model.classifierModel ) - model.intEnabled type Msg @@ -73,6 +89,8 @@ type Msg | SetFullTextConfirm String | TriggerReIndex | TriggerReIndexResult (Result Http.Error BasicResult) + | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg + | SaveSettings update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings ) @@ -85,22 +103,15 @@ update flags msg model = nextModel = { model | langModel = m2 } - - nextSettings = - if Comp.Dropdown.isDropdownChangeMsg m then - Just (getSettings nextModel) - - else - Nothing in - ( nextModel, Cmd.map LangDropdownMsg c2, nextSettings ) + ( nextModel, Cmd.map LangDropdownMsg c2, Nothing ) ToggleIntegrationEndpoint -> let nextModel = { model | intEnabled = not model.intEnabled } in - ( nextModel, Cmd.none, Just (getSettings nextModel) ) + ( nextModel, Cmd.none, Nothing ) SetFullTextConfirm str -> ( { model | fullTextConfirmText = str }, Cmd.none, Nothing ) @@ -138,6 +149,26 @@ update flags msg model = , Nothing ) + ClassifierSettingMsg lmsg -> + let + ( cm, cc ) = + Comp.ClassifierSettingsForm.update flags lmsg model.classifierModel + in + ( { model + | classifierModel = cm + } + , Cmd.map ClassifierSettingMsg cc + , Nothing + ) + + SaveSettings -> + case getSettings model of + Data.Validated.Valid s -> + ( model, Cmd.none, Just s ) + + _ -> + ( model, Cmd.none, Nothing ) + view : Flags -> UiSettings -> Model -> Html Msg view flags settings model = @@ -232,4 +263,31 @@ view flags settings model = |> text ] ] + , h3 + [ classList + [ ( "ui dividing header", True ) + , ( "invisible hidden", False ) + ] + ] + [ text "Document Classifier" + ] + , div + [ classList + [ ( "field", True ) + , ( "invisible hidden", False ) + ] + ] + [ Html.map ClassifierSettingMsg + (Comp.ClassifierSettingsForm.view model.classifierModel) + ] + , div [ class "ui divider" ] [] + , button + [ classList + [ ( "ui primary button", True ) + , ( "disabled", getSettings model |> Data.Validated.isInvalid ) 
+ ] + , onClick SaveSettings + ] + [ text "Save" + ] ] diff --git a/modules/webapp/src/main/elm/Data/Validated.elm b/modules/webapp/src/main/elm/Data/Validated.elm index c56f98c6..40e0f97e 100644 --- a/modules/webapp/src/main/elm/Data/Validated.elm +++ b/modules/webapp/src/main/elm/Data/Validated.elm @@ -1,5 +1,6 @@ module Data.Validated exposing ( Validated(..) + , isInvalid , map , map2 , map3 @@ -14,6 +15,19 @@ type Validated a | Unknown a +isInvalid : Validated a -> Bool +isInvalid v = + case v of + Valid _ -> + False + + Invalid _ _ -> + True + + Unknown _ -> + False + + value : Validated a -> a value va = case va of diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm index 1b1bd53b..b8dd6a2b 100644 --- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm +++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Data.elm @@ -30,15 +30,21 @@ init flags = let ( sm, sc ) = Comp.SourceManage.init flags + + ( cm, cc ) = + Comp.CollectiveSettingsForm.init flags Api.Model.CollectiveSettings.empty in ( { currentTab = Just InsightsTab , sourceModel = sm , userModel = Comp.UserManage.emptyModel - , settingsModel = Comp.CollectiveSettingsForm.init Api.Model.CollectiveSettings.empty + , settingsModel = cm , insights = Api.Model.ItemInsights.empty , submitResult = Nothing } - , Cmd.map SourceMsg sc + , Cmd.batch + [ Cmd.map SourceMsg sc + , Cmd.map SettingsFormMsg cc + ] ) diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm index fa9ab433..7ad68e16 100644 --- a/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm +++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/Update.elm @@ -77,7 +77,13 @@ update flags msg model = ( model, Cmd.none ) CollectiveSettingsResp (Ok data) -> - ( { model | settingsModel = Comp.CollectiveSettingsForm.init data }, Cmd.none ) + let + ( cm, cc ) = + 
Comp.CollectiveSettingsForm.init flags data + in + ( { model | settingsModel = cm } + , Cmd.map SettingsFormMsg cc + ) CollectiveSettingsResp (Err _) -> ( model, Cmd.none ) diff --git a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm index 513e2719..c46aacfb 100644 --- a/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm +++ b/modules/webapp/src/main/elm/Page/CollectiveSettings/View.elm @@ -185,10 +185,11 @@ viewSettings : Flags -> UiSettings -> Model -> List (Html Msg) viewSettings flags settings model = [ h2 [ class "ui header" ] [ i [ class "cog icon" ] [] - , text "Settings" + , text "Collective Settings" ] , div [ class "ui segment" ] - [ Html.map SettingsFormMsg (Comp.CollectiveSettingsForm.view flags settings model.settingsModel) + [ Html.map SettingsFormMsg + (Comp.CollectiveSettingsForm.view flags settings model.settingsModel) ] , div [ classList From 0c97b4ef762f30a0dc77dacbf3f06dac56df0752 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 31 Aug 2020 22:35:27 +0200 Subject: [PATCH 02/10] Initial impl of a text classifier based on stanford-nlp --- .../docspell/analysis/TextAnalyser.scala | 16 +- .../analysis/TextAnalysisConfig.scala | 5 +- .../analysis/nlp/ClassifierModel.scala | 5 + .../docspell/analysis/nlp/PipelineCache.scala | 12 +- .../docspell/analysis/nlp/Properties.scala | 5 +- .../analysis/nlp/StanfordNerClassifier.scala | 2 +- ...ttings.scala => StanfordNerSettings.scala} | 6 +- .../analysis/nlp/StanfordTextClassifier.scala | 149 ++++++++++++++++++ .../analysis/nlp/TextClassifier.scala | 25 +++ .../analysis/nlp/TextClassifierConfig.scala | 10 ++ .../analysis/src/test/resources/test.ser.gz | Bin 0 -> 1682 bytes .../nlp/StanfordTextClassifierSuite.scala | 76 +++++++++ .../joex/src/main/resources/reference.conf | 2 +- .../src/main/scala/docspell/joex/Config.scala | 13 +- .../joex/learn/LearnClassifierTask.scala | 64 ++++++++ 
.../docspell/joex/process/TextAnalysis.scala | 4 +- 16 files changed, 376 insertions(+), 18 deletions(-) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala rename modules/analysis/src/main/scala/docspell/analysis/nlp/{StanfordSettings.scala => StanfordNerSettings.scala} (88%) create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala create mode 100644 modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala create mode 100644 modules/analysis/src/test/resources/test.ser.gz create mode 100644 modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala create mode 100644 modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala index 75d07eef..44f7203b 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalyser.scala @@ -7,18 +7,21 @@ import docspell.analysis.contact.Contact import docspell.analysis.date.DateFind import docspell.analysis.nlp.PipelineCache import docspell.analysis.nlp.StanfordNerClassifier -import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.nlp.StanfordTextClassifier +import docspell.analysis.nlp.TextClassifier import docspell.common._ trait TextAnalyser[F[_]] { def annotate( logger: Logger[F], - settings: StanfordSettings, + settings: StanfordNerSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] + def classifier(blocker: Blocker)(implicit CS: ContextShift[F]): TextClassifier[F] } object TextAnalyser { @@ -35,7 +38,7 @@ object TextAnalyser { new TextAnalyser[F] { def annotate( 
logger: Logger[F], - settings: StanfordSettings, + settings: StanfordNerSettings, cacheKey: Ident, text: String ): F[TextAnalyser.Result] = @@ -48,6 +51,11 @@ object TextAnalyser { spans = NerLabelSpan.build(list) } yield Result(spans ++ list, dates) + def classifier(blocker: Blocker)(implicit + CS: ContextShift[F] + ): TextClassifier[F] = + new StanfordTextClassifier[F](cfg.classifier, blocker) + private def textLimit(logger: Logger[F], text: String): F[String] = if (text.length <= cfg.maxLength) text.pure[F] else @@ -56,7 +64,7 @@ object TextAnalyser { s" Analysing only first ${cfg.maxLength} characters." ) *> text.take(cfg.maxLength).pure[F] - private def stanfordNer(key: Ident, settings: StanfordSettings, text: String) + private def stanfordNer(key: Ident, settings: StanfordNerSettings, text: String) : F[Vector[NerLabel]] = StanfordNerClassifier.nerAnnotate[F](key.id, cache)(settings, text) diff --git a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala index 577f6753..596a6247 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/TextAnalysisConfig.scala @@ -1,5 +1,8 @@ package docspell.analysis +import docspell.analysis.nlp.TextClassifierConfig + case class TextAnalysisConfig( - maxLength: Int + maxLength: Int, + classifier: TextClassifierConfig ) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala new file mode 100644 index 00000000..82f9f9cc --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/ClassifierModel.scala @@ -0,0 +1,5 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +case class ClassifierModel(model: Path) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala 
b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala index 9787563f..88e13ee3 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/PipelineCache.scala @@ -19,7 +19,7 @@ import org.log4s.getLogger */ trait PipelineCache[F[_]] { - def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] + def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] } @@ -28,7 +28,7 @@ object PipelineCache { def none[F[_]: Applicative]: PipelineCache[F] = new PipelineCache[F] { - def obtain(ignored: String, settings: StanfordSettings): F[StanfordCoreNLP] = + def obtain(ignored: String, settings: StanfordNerSettings): F[StanfordCoreNLP] = makeClassifier(settings).pure[F] } @@ -38,7 +38,7 @@ object PipelineCache { final private class Impl[F[_]: Sync](data: Ref[F, Map[String, Entry]]) extends PipelineCache[F] { - def obtain(key: String, settings: StanfordSettings): F[StanfordCoreNLP] = + def obtain(key: String, settings: StanfordNerSettings): F[StanfordCoreNLP] = for { id <- makeSettingsId(settings) nlp <- data.modify(cache => getOrCreate(key, id, cache, settings)) @@ -48,7 +48,7 @@ object PipelineCache { key: String, id: String, cache: Map[String, Entry], - settings: StanfordSettings + settings: StanfordNerSettings ): (Map[String, Entry], StanfordCoreNLP) = cache.get(key) match { case Some(entry) => @@ -68,7 +68,7 @@ object PipelineCache { (cache.updated(key, e), nlp) } - private def makeSettingsId(settings: StanfordSettings): F[String] = { + private def makeSettingsId(settings: StanfordNerSettings): F[String] = { val base = settings.copy(regexNer = None).toString val size: F[Long] = settings.regexNer match { @@ -81,7 +81,7 @@ object PipelineCache { } } - private def makeClassifier(settings: StanfordSettings): StanfordCoreNLP = { + private def makeClassifier(settings: StanfordNerSettings): StanfordCoreNLP = { logger.info(s"Creating 
${settings.lang.name} Stanford NLP NER classifier...") new StanfordCoreNLP(Properties.forSettings(settings)) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index 314f04fb..46a614d1 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -7,6 +7,9 @@ import docspell.common._ object Properties { + def fromMap(m: Map[String, String]): JProps = + apply(m.toSeq: _*) + def apply(ps: (String, String)*): JProps = { val p = new JProps() for ((k, v) <- ps) @@ -14,7 +17,7 @@ object Properties { p } - def forSettings(settings: StanfordSettings): JProps = { + def forSettings(settings: StanfordNerSettings): JProps = { val regexNerFile = settings.regexNer .map(p => p.normalize().toAbsolutePath().toString()) settings.lang match { diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 424396e5..383a07ea 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -25,7 +25,7 @@ object StanfordNerClassifier { def nerAnnotate[F[_]: Applicative]( cacheKey: String, cache: PipelineCache[F] - )(settings: StanfordSettings, text: String): F[Vector[NerLabel]] = + )(settings: StanfordNerSettings, text: String): F[Vector[NerLabel]] = cache .obtain(cacheKey, settings) .map(crf => runClassifier(crf, text)) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala similarity index 88% rename from modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala rename to 
modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala index c2f6f98c..06136a18 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordSettings.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerSettings.scala @@ -19,4 +19,8 @@ import docspell.common._ * as a last step to tag untagged tokens using the provided list of * regexps. */ -case class StanfordSettings(lang: Language, highRecall: Boolean, regexNer: Option[Path]) +case class StanfordNerSettings( + lang: Language, + highRecall: Boolean, + regexNer: Option[Path] +) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala new file mode 100644 index 00000000..3da3b5ba --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala @@ -0,0 +1,149 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import cats.effect._ +import cats.effect.concurrent.Ref +import cats.implicits._ +import fs2.Stream + +import docspell.analysis.nlp.TextClassifier._ +import docspell.common._ + +import edu.stanford.nlp.classify.ColumnDataClassifier + +final class StanfordTextClassifier[F[_]: Sync: ContextShift]( + cfg: TextClassifierConfig, + blocker: Blocker +) extends TextClassifier[F] { + + def trainClassifier[A]( + logger: Logger[F], + data: Stream[F, Data] + )(handler: TextClassifier.Handler[F, A]): F[A] = + File + .withTempDir(cfg.workingDir, "trainclassifier") + .use { dir => + for { + rawData <- writeDataFile(blocker, dir, data) + _ <- logger.debug(s"Learning from ${rawData.count} items.") + trainData <- splitData(logger, rawData) + scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m)) + sorted = scores.sortBy(-_.score) + res <- handler(sorted.head.model) + } yield res + } + + def classify( + logger: Logger[F], + model: ClassifierModel, + text: String + ): 
F[Option[String]] = + Sync[F].delay { + val cls = ColumnDataClassifier.getClassifier( + model.model.normalize().toAbsolutePath().toString() + ) + val cat = cls.classOf(cls.makeDatumFromLine(normalisedText(text))) + Option(cat) + } + + // --- helpers + + def train( + logger: Logger[F], + in: TrainData, + props: Map[String, String] + ): F[TrainResult] = + for { + _ <- logger.debug(s"Training classifier from $props") + res <- Sync[F].delay { + val cdc = new ColumnDataClassifier(Properties.fromMap(amendProps(in, props))) + cdc.trainClassifier(in.train.toString()) + val score = cdc.testClassifier(in.test.toString()) + TrainResult(score.first(), ClassifierModel(in.modelFile)) + } + _ <- logger.debug(s"Trained with result $res") + } yield res + + def splitData(logger: Logger[F], in: RawData): F[TrainData] = { + val nTest = (in.count * 0.25).toLong + + val td = + TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt")) + + val fileLines = + fs2.io.file + .readAll(in.file, blocker, 4096) + .through(fs2.text.utf8Decode) + .through(fs2.text.lines) + + for { + _ <- logger.debug( + s"Splitting raw data into test/train data. 
Testing with $nTest entries" + ) + _ <- + fileLines + .take(nTest) + .intersperse("\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(td.test, blocker)) + .compile + .drain + _ <- + fileLines + .drop(nTest) + .intersperse("\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(td.train, blocker)) + .compile + .drain + } yield td + } + + def writeDataFile(blocker: Blocker, dir: Path, data: Stream[F, Data]): F[RawData] = { + val target = dir.resolve("rawdata") + for { + counter <- Ref.of[F, Long](0L) + _ <- + data + .map(d => s"${d.cls}\t${d.ref}\t${normalisedText(d.text)}") + .evalTap(_ => counter.update(_ + 1)) + .intersperse("\n") + .through(fs2.text.utf8Encode) + .through(fs2.io.file.writeAll(target, blocker)) + .compile + .drain + lines <- counter.get + } yield RawData(lines, target) + + } + + def normalisedText(text: String): String = + text.replaceAll("[\n\t]+", " ") + + def amendProps( + trainData: TrainData, + props: Map[String, String] + ): Map[String, String] = + prepend("2", props) ++ Map( + "trainFile" -> trainData.train.normalize().toAbsolutePath().toString(), + "testFile" -> trainData.test.normalize().toAbsolutePath().toString(), + "serializeTo" -> trainData.modelFile.normalize().toAbsolutePath().toString() + ).toList + + case class RawData(count: Long, file: Path) + case class TrainData(train: Path, test: Path) { + val modelFile = train.resolveSibling("model.ser.gz") + } + + case class TrainResult(score: Double, model: ClassifierModel) + + def prepend(pre: String, data: Map[String, String]): Map[String, String] = + data.toList + .map({ + case (k, v) => + if (k.startsWith(pre)) (k, v) + else (pre + k, v) + }) + .toMap +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala new file mode 100644 index 00000000..f2927d0c --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifier.scala 
@@ -0,0 +1,25 @@ +package docspell.analysis.nlp + +import cats.data.Kleisli +import fs2.Stream + +import docspell.analysis.nlp.TextClassifier.Data +import docspell.common._ + +trait TextClassifier[F[_]] { + + def trainClassifier[A](logger: Logger[F], data: Stream[F, Data])( + handler: TextClassifier.Handler[F, A] + ): F[A] + + def classify(logger: Logger[F], model: ClassifierModel, text: String): F[Option[String]] + +} + +object TextClassifier { + + type Handler[F[_], A] = Kleisli[F, ClassifierModel, A] + + case class Data(cls: String, ref: String, text: String) + +} diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala new file mode 100644 index 00000000..e3baac46 --- /dev/null +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/TextClassifierConfig.scala @@ -0,0 +1,10 @@ +package docspell.analysis.nlp + +import java.nio.file.Path + +import cats.data.NonEmptyList + +case class TextClassifierConfig( + workingDir: Path, + classifierConfigs: NonEmptyList[Map[String, String]] +) diff --git a/modules/analysis/src/test/resources/test.ser.gz b/modules/analysis/src/test/resources/test.ser.gz new file mode 100644 index 0000000000000000000000000000000000000000..b6d0956ba0f2100bc670502e77717ac428688a9d GIT binary patch literal 1682 zcmV;D25tEtiwFP!000000G(G|h#W-_uG{2t|NlSvAr}RsVnTKaY7`E<+}*m#hRNlU zI}_OzN#9KGZs%ruCfz-^dl8hNMkVn<(FY{~1q~4s6<H7Jqx~jUm_TsP5ECp<%mY0-@@PZ?(B)8q9X%i_~VKHeiPKlTd zmW>j~ACtzusF(Kywb(F|MfK26PrEQJ$b#ZqcfO5d#5J5nu`dQ=VdW%4d%n&~B4C zYWu6>@FXPfC`Lp4pU#@{f1~={yD??1>h%B_#_FQ$x zc7y*^TJft_zf}=!AVwP(f-Gvkswcl1dF={0w+5QBuvW4Y6fJAbwUO4ASS$84S@ZPk zJsYq5{lfX*63}EIz#=yl+w`ZD=wDA2fnRSlh1nH-;!N?;%ldeozq)-4H#V>CqJkqeXF*r6X2IzY`> zEs7%6uK$2_pu)+VP3yt#{qFsbFO{8lv_m?~O;Q(ZGx$I4Vb(fpSxYEr5!9s2RkdQU z2`?tIcu5k<(22CICb&Wk&aisKV7q`bP+E&%%r~I zjX|pQYp1sA=RTpAoAuX*fYt4P_i24k=tK3;`7?v>nz=0zYyy@&D5ekFl}jB~` 
z_oHyxE6;eY>j;&G;gEjB?`X5psn0=cf!UOkGR;D75h>$bR;UO($wGHgIG&4U146Cc z=Ae_wLzLM5W)Y_4m?srRFAdH1PC<)lX!U=~q*{Ar2-sz_GE>1OsLVm1?G$HRo2et1 zhe^ld3ZapP+vO5-Q_ev*E}5z$wAl{2XkyTF&{KPlqTGol{da@%B5n$o8oQ3viJmk_ zFuqJ;&=t7PIQ`QESveR9$c%PJmq2X z_Cbd%d4&R=cB(@-5U!$3>k7nhN`^tyTvp`hj7iXgAUmkjEgNE&ts~5oh>HQ*R3peE zOqHf*Cqf^n!0M65MI6~QUI&g%acQDryC>KFGOH_@$Y@5!)I?xCD2E&j z;rq0p1y-HG=(1`=*Rltv0vY z{gR<#G!I=t=hDTmO8)edHDIuT7z!&$-VAlN;-p!ba(x(x9fJ=AIE|;=zZdSwLuY-@ z)U<>F1NF2JN3*sWFl5%wWH7=kj*B8X+#d`N`u<7<(L}@Trq-P-5#euXG7sG`1*aNZ^T-^g5MB~4$VGHHKM<-b^d+b1E1aqr-LPY z^bJ1t%!wcNyf^s7^64QKiJsk0?OsJFlnZ6`5YmJnOVgVg0mg3BE8u$5u4~sfPwuEg zYH9>^!^auhY@#ax3A|ehdIG_J c3vL|H?ZIe-e@`Bh+e5s60M;VBa!(Ba0MQ0Ep#T5? literal 0 HcmV?d00001 diff --git a/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala new file mode 100644 index 00000000..b9596923 --- /dev/null +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/StanfordTextClassifierSuite.scala @@ -0,0 +1,76 @@ +package docspell.analysis.nlp + +import minitest._ +import cats.effect._ +import scala.concurrent.ExecutionContext +import java.nio.file.Paths +import cats.data.NonEmptyList +import docspell.common._ +import fs2.Stream +import cats.data.Kleisli +import TextClassifier.Data + +object StanfordTextClassifierSuite extends SimpleTestSuite { + val logger = Logger.log4s[IO](org.log4s.getLogger) + + implicit val CS = IO.contextShift(ExecutionContext.global) + + test("learn from data") { + val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map())) + + val data = + Stream + .emit(Data("invoice", "n", "this is your invoice total $421")) + .repeat + .take(10) + .zip( + Stream + .emit(Data("receipt", "n", "shopping receipt cheese cake bar")) + .repeat + .take(10) + ) + .flatMap({ + case (a, b) => + Stream.emits(Seq(a, b)) + }) + .covary[IO] + + val modelExists = + Blocker[IO].use { blocker => + val classifier = 
new StanfordTextClassifier[IO](cfg, blocker) + classifier.trainClassifier[Boolean](logger, data)( + Kleisli(result => File.existsNonEmpty[IO](result.model)) + ) + } + assertEquals(modelExists.unsafeRunSync(), true) + } + + test("run classifier") { + val cfg = TextClassifierConfig(Paths.get("target"), NonEmptyList.of(Map())) + val things = for { + dir <- File.withTempDir[IO](Paths.get("target"), "testcls") + blocker <- Blocker[IO] + } yield (dir, blocker) + + things + .use { + case (dir, blocker) => + val classifier = new StanfordTextClassifier[IO](cfg, blocker) + + val modelFile = dir.resolve("test.ser.gz") + for { + _ <- + LenientUri + .fromJava(getClass.getResource("/test.ser.gz")) + .readURL[IO](4096, blocker) + .through(fs2.io.file.writeAll(modelFile, blocker)) + .compile + .drain + model = ClassifierModel(modelFile) + cat <- classifier.classify(logger, model, "there is receipt always") + _ = assertEquals(cat, Some("receipt")) + } yield () + } + .unsafeRunSync() + } +} diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 746f7bac..e09bfd3b 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -298,7 +298,7 @@ docspell.joex { # These settings are used to configure the classifier. If # multiple are given, they are all tried and the "best" is # chosen at the end. See - # https://nlp.stanford.edu/wiki/Software/Classifier/20_Newsgroups + # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html # for more info about these settings. The settings are almost # identical to them, as they yielded best results with *my* # dataset. 
diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index a90ad61a..cbbb4a33 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -2,7 +2,10 @@ package docspell.joex import java.nio.file.Path +import cats.data.NonEmptyList + import docspell.analysis.TextAnalysisConfig +import docspell.analysis.nlp.TextClassifierConfig import docspell.backend.Config.Files import docspell.common._ import docspell.convert.ConvertConfig @@ -62,7 +65,15 @@ object Config { ) { def textAnalysisConfig: TextAnalysisConfig = - TextAnalysisConfig(maxLength) + TextAnalysisConfig( + maxLength, + TextClassifierConfig( + workingDir, + NonEmptyList + .fromList(classification.classifiers) + .getOrElse(NonEmptyList.of(Map.empty)) + ) + ) def regexNerFileConfig: RegexNerFile.Config = RegexNerFile.Config(regexNer.enabled, workingDir, regexNer.fileCacheTime) diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala new file mode 100644 index 00000000..a161417a --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -0,0 +1,64 @@ +package docspell.joex.learn + +import cats.data.Kleisli +import cats.data.OptionT +import cats.effect._ +import fs2.Stream + +import docspell.analysis.TextAnalyser +import docspell.analysis.nlp.ClassifierModel +import docspell.analysis.nlp.TextClassifier.Data +import docspell.backend.ops.OCollective +import docspell.common._ +import docspell.joex.Config +import docspell.joex.scheduler._ + +object LearnClassifierTask { + + type Args = LearnClassifierArgs + + def apply[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, + blocker: Blocker, + analyser: TextAnalyser[F] + ): Task[F, Args, Unit] = + Task { ctx => + (for { + sett <- findActiveSettings[F](ctx.args.collective, cfg) + data = 
selectItems( + ctx, + math.min(cfg.classification.itemCount, sett.itemCount), + sett.category.getOrElse("") + ) + _ <- OptionT.liftF( + analyser + .classifier(blocker) + .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx))) + ) + } yield ()) + .getOrElseF(logInactiveWarning(ctx.logger)) + } + + private def handleModel[F[_]]( + ctx: Context[F, Args] + )(trainedModel: ClassifierModel): F[Unit] = + ??? + + private def selectItems[F[_]]( + ctx: Context[F, Args], + max: Int, + category: String + ): Stream[F, Data] = + ??? + + private def findActiveSettings[F[_]: Sync]( + coll: Ident, + cfg: Config.TextAnalysis + ): OptionT[F, OCollective.Classifier] = + ??? + + private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] = + logger.warn( + "Classification is disabled. Check joex config and the collective settings." + ) +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index abbb6870..92975a70 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -4,7 +4,7 @@ import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser -import docspell.analysis.nlp.StanfordSettings +import docspell.analysis.nlp.StanfordNerSettings import docspell.common._ import docspell.joex.analysis.RegexNerFile import docspell.joex.process.ItemData.AttachmentDates @@ -42,7 +42,7 @@ object TextAnalysis { analyser: TextAnalyser[F], nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { - val settings = StanfordSettings(ctx.args.meta.language, false, None) + val settings = StanfordNerSettings(ctx.args.meta.language, false, None) for { customNer <- nerFile.makeFile(ctx.args.meta.collective) sett = settings.copy(regexNer = customNer) From 68bb65572b361592dabc686a50c2a949252e84a5 Mon Sep 17 00:00:00 2001 From: Eike 
Kettner Date: Tue, 1 Sep 2020 00:21:19 +0200 Subject: [PATCH 03/10] Integrate learn-classifier task into the app --- .../scala/docspell/backend/BackendApp.scala | 4 +-- .../docspell/backend/ops/OCollective.scala | 27 ++++++++++++++++++- .../scala/docspell/joex/JoexAppImpl.scala | 8 ++++++ .../joex/learn/LearnClassifierTask.scala | 16 ++++++++--- .../store/records/RClassifierSetting.scala | 4 +++ 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala index 6ff3c73e..a9572832 100644 --- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala +++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala @@ -52,12 +52,12 @@ object BackendApp { queue <- JobQueue(store) loginImpl <- Login[F](store) signupImpl <- OSignup[F](store) - collImpl <- OCollective[F](store) + joexImpl <- OJoex(JoexClient(httpClient), store) + collImpl <- OCollective[F](store, utStore, joexImpl) sourceImpl <- OSource[F](store) tagImpl <- OTag[F](store) equipImpl <- OEquipment[F](store) orgImpl <- OOrganization(store) - joexImpl <- OJoex(JoexClient(httpClient), store) uploadImpl <- OUpload(store, queue, cfg.files, joexImpl) nodeImpl <- ONode(store) jobImpl <- OJob(store, joexImpl) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala index 48934016..955a4649 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala @@ -9,8 +9,12 @@ import docspell.backend.ops.OCollective._ import docspell.common._ import docspell.store.queries.QCollective import docspell.store.records._ +import docspell.store.usertask.UserTask +import docspell.store.usertask.UserTaskStore import docspell.store.{AddResult, Store} +import com.github.eikek.calev.CalEvent + trait 
OCollective[F[_]] { def find(name: Ident): F[Option[RCollective]] @@ -95,7 +99,11 @@ object OCollective { } } - def apply[F[_]: Effect](store: Store[F]): Resource[F, OCollective[F]] = + def apply[F[_]: Effect]( + store: Store[F], + uts: UserTaskStore[F], + joex: OJoex[F] + ): Resource[F, OCollective[F]] = Resource.pure[F, OCollective[F]](new OCollective[F] { def find(name: Ident): F[Option[RCollective]] = store.transact(RCollective.findById(name)) @@ -105,6 +113,23 @@ object OCollective { .transact(RCollective.updateSettings(collective, sett)) .attempt .map(AddResult.fromUpdate) + .flatMap(res => updateLearnClassifierTask(collective, sett) *> res.pure[F]) + + def updateLearnClassifierTask(coll: Ident, sett: Settings) = + for { + id <- Ident.randomId[F] + on = sett.classifier.map(_.enabled).getOrElse(false) + timer = sett.classifier.map(_.schedule).getOrElse(CalEvent.unsafe("")) + ut = UserTask( + id, + LearnClassifierArgs.taskName, + on, + timer, + LearnClassifierArgs(coll) + ) + _ <- uts.updateOneTask(AccountId(coll, LearnClassifierArgs.taskName), ut) + _ <- joex.notifyAllNodes + } yield () def findSettings(collective: Ident): F[Option[OCollective.Settings]] = store.transact(RCollective.getSettings(collective)) diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 2fa94c25..7c3f57fc 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -14,6 +14,7 @@ import docspell.ftssolr.SolrFtsClient import docspell.joex.analysis.RegexNerFile import docspell.joex.fts.{MigrationTask, ReIndexTask} import docspell.joex.hk._ +import docspell.joex.learn.LearnClassifierTask import docspell.joex.notify._ import docspell.joex.pdfconv.ConvertAllPdfTask import docspell.joex.pdfconv.PdfConvTask @@ -159,6 +160,13 @@ object JoexAppImpl { ConvertAllPdfTask.onCancel[F] ) ) + .withTask( + JobTask.json( + 
LearnClassifierArgs.taskName, + LearnClassifierTask[F](cfg.textAnalysis, blocker, analyser), + LearnClassifierTask.onCancel[F] + ) + ) .resource psch <- PeriodicScheduler.create( cfg.periodicScheduler, diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index a161417a..6c11fecf 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -12,11 +12,15 @@ import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ +import docspell.store.records.RClassifierSetting object LearnClassifierTask { type Args = LearnClassifierArgs + def onCancel[F[_]: Sync]: Task[F, Args, Unit] = + Task.log(_.warn("Cancelling learn-classifier task")) + def apply[F[_]: Sync: ContextShift]( cfg: Config.TextAnalysis, blocker: Blocker, @@ -24,7 +28,7 @@ object LearnClassifierTask { ): Task[F, Args, Unit] = Task { ctx => (for { - sett <- findActiveSettings[F](ctx.args.collective, cfg) + sett <- findActiveSettings[F](ctx, cfg) data = selectItems( ctx, math.min(cfg.classification.itemCount, sett.itemCount), @@ -52,10 +56,16 @@ object LearnClassifierTask { ??? private def findActiveSettings[F[_]: Sync]( - coll: Ident, + ctx: Context[F, Args], cfg: Config.TextAnalysis ): OptionT[F, OCollective.Classifier] = - ??? 
+ if (cfg.classification.enabled) + OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) + .filter(_.enabled) + .filter(_.category.nonEmpty) + .map(OCollective.Classifier.fromRecord) + else + OptionT.none private def logInactiveWarning[F[_]: Sync](logger: Logger[F]): F[Unit] = logger.warn( diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index 671a8d8f..c15f870c 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -102,5 +102,9 @@ object RClassifierSetting { created ) } + object Classifier { + def fromRecord(r: RClassifierSetting): Classifier = + Classifier(r.enabled, r.schedule, r.itemCount, r.category.some) + } } From 316b490008da457db53cdc34596bdedd49551471 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 1 Sep 2020 07:50:21 +0200 Subject: [PATCH 04/10] Implement learning a text classifier from collective data --- .../analysis/nlp/StanfordTextClassifier.scala | 18 +++-- .../joex/learn/LearnClassifierTask.scala | 52 +++++++++++--- .../docspell/store/impl/DoobieSyntax.scala | 4 +- .../scala/docspell/store/queries/QItem.scala | 71 +++++++++++++++++++ .../store/records/RClassifierSetting.scala | 3 + 5 files changed, 130 insertions(+), 18 deletions(-) diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala index 3da3b5ba..d8846fc4 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordTextClassifier.scala @@ -26,7 +26,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( .use { dir => for { rawData <- writeDataFile(blocker, dir, data) - _ <- 
logger.debug(s"Learning from ${rawData.count} items.") + _ <- logger.info(s"Learning from ${rawData.count} items.") trainData <- splitData(logger, rawData) scores <- cfg.classifierConfigs.traverse(m => train(logger, trainData, m)) sorted = scores.sortBy(-_.score) @@ -43,7 +43,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( val cls = ColumnDataClassifier.getClassifier( model.model.normalize().toAbsolutePath().toString() ) - val cat = cls.classOf(cls.makeDatumFromLine(normalisedText(text))) + val cat = cls.classOf(cls.makeDatumFromLine("\t\t" + normalisedText(text))) Option(cat) } @@ -66,7 +66,7 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( } yield res def splitData(logger: Logger[F], in: RawData): F[TrainData] = { - val nTest = (in.count * 0.25).toLong + val nTest = (in.count * 0.15).toLong val td = TrainData(in.file.resolveSibling("train.txt"), in.file.resolveSibling("test.txt")) @@ -106,9 +106,10 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( counter <- Ref.of[F, Long](0L) _ <- data - .map(d => s"${d.cls}\t${d.ref}\t${normalisedText(d.text)}") + .filter(_.text.nonEmpty) + .map(d => s"${d.cls}\t${fixRef(d.ref)}\t${normalisedText(d.text)}") .evalTap(_ => counter.update(_ + 1)) - .intersperse("\n") + .intersperse("\r\n") .through(fs2.text.utf8Encode) .through(fs2.io.file.writeAll(target, blocker)) .compile @@ -119,13 +120,16 @@ final class StanfordTextClassifier[F[_]: Sync: ContextShift]( } def normalisedText(text: String): String = - text.replaceAll("[\n\t]+", " ") + text.replaceAll("[\n\r\t]+", " ") + + def fixRef(str: String): String = + str.replace('\t', '_') def amendProps( trainData: TrainData, props: Map[String, String] ): Map[String, String] = - prepend("2", props) ++ Map( + prepend("2.", props) ++ Map( "trainFile" -> trainData.train.normalize().toAbsolutePath().toString(), "testFile" -> trainData.test.normalize().toAbsolutePath().toString(), "serializeTo" -> 
trainData.modelFile.normalize().toAbsolutePath().toString() diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 6c11fecf..013cd215 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -3,7 +3,8 @@ package docspell.joex.learn import cats.data.Kleisli import cats.data.OptionT import cats.effect._ -import fs2.Stream +import cats.implicits._ +import fs2.{Pipe, Stream} import docspell.analysis.TextAnalyser import docspell.analysis.nlp.ClassifierModel @@ -12,9 +13,13 @@ import docspell.backend.ops.OCollective import docspell.common._ import docspell.joex.Config import docspell.joex.scheduler._ +import docspell.store.queries.QItem import docspell.store.records.RClassifierSetting +import bitpeace.MimetypeHint + object LearnClassifierTask { + val noClass = "__NONE__" type Args = LearnClassifierArgs @@ -31,29 +36,58 @@ object LearnClassifierTask { sett <- findActiveSettings[F](ctx, cfg) data = selectItems( ctx, - math.min(cfg.classification.itemCount, sett.itemCount), + math.min(cfg.classification.itemCount, sett.itemCount).toLong, sett.category.getOrElse("") ) _ <- OptionT.liftF( analyser .classifier(blocker) - .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx))) + .trainClassifier[Unit](ctx.logger, data)(Kleisli(handleModel(ctx, blocker))) ) } yield ()) .getOrElseF(logInactiveWarning(ctx.logger)) } - private def handleModel[F[_]]( - ctx: Context[F, Args] + private def handleModel[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + blocker: Blocker )(trainedModel: ClassifierModel): F[Unit] = - ??? 
+ for { + oldFile <- ctx.store.transact( + RClassifierSetting.findById(ctx.args.collective).map(_.flatMap(_.fileId)) + ) + _ <- ctx.logger.info("Storing new trained model") + fileData = fs2.io.file.readAll(trainedModel.model, blocker, 4096) + newFile <- + ctx.store.bitpeace.saveNew(fileData, 4096, MimetypeHint.none).compile.lastOrError + _ <- ctx.store.transact( + RClassifierSetting.updateFile(ctx.args.collective, Ident.unsafe(newFile.id)) + ) + _ <- ctx.logger.debug(s"New model stored at file ${newFile.id}") + _ <- oldFile match { + case Some(fid) => + ctx.logger.debug(s"Deleting old model file ${fid.id}") *> + ctx.store.bitpeace.delete(fid.id).compile.drain + case None => ().pure[F] + } + } yield () private def selectItems[F[_]]( ctx: Context[F, Args], - max: Int, + max: Long, category: String - ): Stream[F, Data] = - ??? + ): Stream[F, Data] = { + val connStream = + for { + item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max)) + tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category)) + } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) + ctx.store.transact(connStream.filter(_.text.nonEmpty)) + } + + private def restrictTo[F[_], A](max: Long): Pipe[F, A, A] = + if (max <= 0) identity + else _.take(max) private def findActiveSettings[F[_]: Sync]( ctx: Context[F, Args], diff --git a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala index e4a67538..3a992b71 100644 --- a/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala +++ b/modules/store/src/main/scala/docspell/store/impl/DoobieSyntax.scala @@ -67,8 +67,8 @@ trait DoobieSyntax { Fragment.const(" FROM ") ++ table ++ this.where(where) def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment = - Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++ - Fragment.const(") FROM ") ++ table ++ this.where(where) 
+ Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++ + Fragment.const(" FROM ") ++ table ++ this.where(where) def selectCount(col: Column, table: Fragment, where: Fragment): Fragment = Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 1240d4a7..312523ce 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref import cats.implicits._ import fs2.Stream +import docspell.common.syntax.all._ import docspell.common.{IdRef, _} import docspell.store.Store import docspell.store.impl.Implicits._ @@ -615,4 +616,74 @@ object QItem { .query[NameAndNotes] .streamWithChunkSize(chunkSize) } + + def findAllNewesFirst( + collective: Ident, + chunkSize: Int + ): Stream[ConnectionIO, Ident] = { + val cols = Seq(RItem.Columns.id) + (selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++ + orderBy(RItem.Columns.created.desc)) + .query[Ident] + .streamWithChunkSize(chunkSize) + } + + case class TagName(id: Ident, name: String) + case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName]) + + def resolveTextAndTag( + collective: Ident, + itemId: Ident, + tagCategory: String + ): ConnectionIO[TextAndTag] = { + val aId = RAttachment.Columns.id.prefix("a") + val aItem = RAttachment.Columns.itemId.prefix("a") + val mId = RAttachmentMeta.Columns.id.prefix("m") + val mText = RAttachmentMeta.Columns.content.prefix("m") + val tiItem = RTagItem.Columns.itemId.prefix("ti") + val tiTag = RTagItem.Columns.tagId.prefix("ti") + val tId = RTag.Columns.tid.prefix("t") + val tName = RTag.Columns.name.prefix("t") + val tCat = RTag.Columns.category.prefix("t") + val iId = RItem.Columns.id.prefix("i") + val iColl = RItem.Columns.cid.prefix("i") + + 
val cte = withCTE( + "tags" -> selectSimple( + Seq(tiItem, tId, tName), + RTagItem.table ++ fr"ti INNER JOIN" ++ + RTag.table ++ fr"t ON" ++ tId.is(tiTag), + and(tiItem.is(itemId), tCat.is(tagCategory)) + ) + ) + + val cols = Seq(mText, tId, tName) + + val from = RItem.table ++ fr"i INNER JOIN" ++ + RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++ + RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++ + fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId) + + val where = + and( + iId.is(itemId), + iColl.is(collective), + mText.isNotNull, + mText.isNot("") + ) + + val q = cte ++ selectDistinct(cols, from, where) + for { + _ <- logger.ftrace[ConnectionIO]( + s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})" + ) + texts <- q.query[(String, Option[TagName])].to[List] + _ <- logger.ftrace[ConnectionIO]( + s"Got ${texts.size} text and tag entries for item ${itemId.id}" + ) + tag = texts.headOption.flatMap(_._2) + txt = texts.map(_._1).mkString(" --n-- ") + } yield TextAndTag(itemId, txt, tag) + } + } diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index c15f870c..680741a0 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -61,6 +61,9 @@ object RClassifierSetting { sql.update.run } + def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] = + updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run + def updateSettings(v: RClassifierSetting): ConnectionIO[Int] = for { n1 <- updateRow( From 237b96062553439cea4c38d3aa7b4a3518aeee1d Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 1 Sep 2020 21:51:57 +0200 Subject: [PATCH 05/10] Guess a tag on item processing using a trained model if available --- .../docspell/joex/process/ItemData.scala | 3 + 
.../docspell/joex/process/ProcessItem.scala | 4 +- .../docspell/joex/process/TextAnalysis.scala | 59 +++++++++++++++++-- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index d4f83fc2..af9a3db2 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -38,6 +38,9 @@ case class ItemData( copy(metas = next) } + def appendTags(tags: Seq[String]): ItemData = + copy(tags = (this.tags ++ tags.toList).distinct) + def changeMeta( attachId: Ident, f: RAttachmentMeta => RAttachmentMeta diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 7b8b6431..fb777b24 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -34,12 +34,12 @@ object ProcessItem { )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = processAttachments0[F](cfg, fts, analyser, regexNer, (30, 60, 90))(item) - def analysisOnly[F[_]: Sync]( + def analysisOnly[F[_]: Sync: ContextShift]( cfg: Config, analyser: TextAnalyser[F], regexNer: RegexNerFile[F] )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextAnalysis[F](analyser, regexNer)(item) + TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.processing)) .flatMap(EvalProposals[F]) .flatMap(SaveProposals[F]) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 92975a70..039f52e7 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,23 +1,32 @@ package 
docspell.joex.process +import cats.data.OptionT import cats.effect._ import cats.implicits._ import docspell.analysis.TextAnalyser +import docspell.analysis.nlp.ClassifierModel import docspell.analysis.nlp.StanfordNerSettings +import docspell.analysis.nlp.TextClassifier import docspell.common._ +import docspell.joex.Config import docspell.joex.analysis.RegexNerFile import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task import docspell.store.records.RAttachmentMeta +import docspell.store.records.RClassifierSetting + +import bitpeace.RangeDef object TextAnalysis { + type Args = ProcessItemArgs - def apply[F[_]: Sync]( + def apply[F[_]: Sync: ContextShift]( + cfg: Config.TextAnalysis, analyser: TextAnalyser[F], nerFile: RegexNerFile[F] - )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = + )(item: ItemData): Task[F, Args, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Starting text analysis") @@ -34,11 +43,14 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + tag <- predictTag(ctx, cfg, item.metas, analyser.classifier(ctx.blocker)).value + } yield item + .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + .appendTags(tag.toSeq) } def annotateAttachment[F[_]: Sync]( - ctx: Context[F, ProcessItemArgs], + ctx: Context[F, Args], analyser: TextAnalyser[F], nerFile: RegexNerFile[F] )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] = { @@ -54,4 +66,43 @@ object TextAnalysis { ) } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates)) } + + def predictTag[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): OptionT[F, String] = + for { + model <- findActiveModel(ctx, cfg) + _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag 
…")) + text = metas.flatMap(_.content).mkString(" ------ ") + modelData = + ctx.store.bitpeace + .get(model.id) + .unNoneTerminate + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) + cls <- OptionT(File.withTempDir(cfg.workingDir, "classify").use { dir => + val modelFile = dir.resolve("model.ser.gz") + modelData + .through(fs2.io.file.writeAll(modelFile, ctx.blocker)) + .compile + .drain + .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) + + }) + _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) + } yield cls + + private def findActiveModel[F[_]: Sync]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis + ): OptionT[F, Ident] = + if (cfg.classification.enabled) + OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.meta.collective))) + .filter(_.enabled) + .mapFilter(_.fileId) + else + OptionT.none + } From 8677eca6d4af715259b1d6897b8a587b4f089782 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 1 Sep 2020 21:59:31 +0200 Subject: [PATCH 06/10] Fix setting default in dropdown --- .../webapp/src/main/elm/Comp/ClassifierSettingsForm.elm | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm index ef6a7638..23e440cd 100644 --- a/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/ClassifierSettingsForm.elm @@ -54,7 +54,7 @@ init flags sett = in ( { enabled = sett.enabled , categoryModel = Comp.FixedDropdown.initString [] - , category = Nothing + , category = sett.category , scheduleModel = cem , schedule = Data.Validated.Unknown newSchedule , itemCountModel = Comp.IntField.init (Just 0) Nothing True "Item Count" @@ -92,7 +92,12 @@ update flags msg model = in ( { model | categoryModel = Comp.FixedDropdown.initString categories - , category = List.head categories + , category = + if model.category == Nothing then + 
List.head categories + + else + model.category } , Cmd.none ) From f9fcee81a5f141c51b2d93cadd01e98b996b8d65 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 1 Sep 2020 23:56:57 +0200 Subject: [PATCH 07/10] Add start-now button for train-classifier task --- .../scala/docspell/backend/BackendApp.scala | 2 +- .../docspell/backend/ops/OCollective.scala | 20 +++++- .../src/main/resources/docspell-openapi.yml | 22 +++++++ .../restserver/routes/CollectiveRoutes.scala | 6 ++ modules/webapp/src/main/elm/Api.elm | 14 ++++ .../main/elm/Comp/CollectiveSettingsForm.elm | 66 +++++++++++++++---- 6 files changed, 115 insertions(+), 15 deletions(-) diff --git a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala index a9572832..be76d45b 100644 --- a/modules/backend/src/main/scala/docspell/backend/BackendApp.scala +++ b/modules/backend/src/main/scala/docspell/backend/BackendApp.scala @@ -53,7 +53,7 @@ object BackendApp { loginImpl <- Login[F](store) signupImpl <- OSignup[F](store) joexImpl <- OJoex(JoexClient(httpClient), store) - collImpl <- OCollective[F](store, utStore, joexImpl) + collImpl <- OCollective[F](store, utStore, queue, joexImpl) sourceImpl <- OSource[F](store) tagImpl <- OTag[F](store) equipImpl <- OEquipment[F](store) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala index 955a4649..5e9b5aaf 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OCollective.scala @@ -8,12 +8,13 @@ import docspell.backend.PasswordCrypt import docspell.backend.ops.OCollective._ import docspell.common._ import docspell.store.queries.QCollective +import docspell.store.queue.JobQueue import docspell.store.records._ import docspell.store.usertask.UserTask import docspell.store.usertask.UserTaskStore import 
docspell.store.{AddResult, Store} -import com.github.eikek.calev.CalEvent +import com.github.eikek.calev._ trait OCollective[F[_]] { @@ -49,6 +50,7 @@ trait OCollective[F[_]] { def findEnabledSource(sourceId: Ident): F[Option[RSource]] + def startLearnClassifier(collective: Ident): F[Unit] } object OCollective { @@ -102,6 +104,7 @@ object OCollective { def apply[F[_]: Effect]( store: Store[F], uts: UserTaskStore[F], + queue: JobQueue[F], joex: OJoex[F] ): Resource[F, OCollective[F]] = Resource.pure[F, OCollective[F]](new OCollective[F] { @@ -131,6 +134,21 @@ object OCollective { _ <- joex.notifyAllNodes } yield () + def startLearnClassifier(collective: Ident): F[Unit] = + for { + id <- Ident.randomId[F] + ut <- UserTask( + id, + LearnClassifierArgs.taskName, + true, + CalEvent(WeekdayComponent.All, DateEvent.All, TimeEvent.All), + LearnClassifierArgs(collective) + ).encode.toPeriodicTask(AccountId(collective, LearnClassifierArgs.taskName)) + job <- ut.toJob + _ <- queue.insert(job) + _ <- joex.notifyAllNodes + } yield () + def findSettings(collective: Ident): F[Option[OCollective.Settings]] = store.transact(RCollective.getSettings(collective)) diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index 1a20db8d..a03a0e2e 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -1047,6 +1047,28 @@ paths: application/json: schema: $ref: "#/components/schemas/ContactList" + + /sec/collective/classifier/startonce: + post: + tags: [ Collective ] + summary: Starts the learn-classifier task + description: | + If the collective has classification enabled, this will submit + the task for learning a classifier from existing data. This + task is usually run periodically as determined by the + collective settings. + + The request is empty, settings are used from the collective.
+ security: + - authTokenHeader: [] + responses: + 200: + description: Ok + content: + application/json: + schema: + $ref: "#/components/schemas/BasicResult" + /sec/user: get: tags: [ Collective ] diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala index 2aed289f..bf7eaddd 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/CollectiveRoutes.scala @@ -88,6 +88,12 @@ object CollectiveRoutes { resp <- Ok(ContactList(res.map(Conversions.mkContact))) } yield resp + case POST -> Root / "classifier" / "startonce" => + for { + _ <- backend.collective.startLearnClassifier(user.account.collective) + resp <- Ok(BasicResult(true, "Task submitted")) + } yield resp + case GET -> Root => for { collDb <- backend.collective.find(user.account.collective) diff --git a/modules/webapp/src/main/elm/Api.elm b/modules/webapp/src/main/elm/Api.elm index 10bcf7ff..ccba8570 100644 --- a/modules/webapp/src/main/elm/Api.elm +++ b/modules/webapp/src/main/elm/Api.elm @@ -88,6 +88,7 @@ module Api exposing , setItemNotes , setTags , setUnconfirmed + , startClassifier , startOnceNotifyDueItems , startOnceScanMailbox , startReIndex @@ -795,6 +796,19 @@ versionInfo flags receive = --- Collective +startClassifier : + Flags + -> (Result Http.Error BasicResult -> msg) + -> Cmd msg +startClassifier flags receive = + Http2.authPost + { url = flags.config.baseUrl ++ "/api/v1/sec/collective/classifier/startonce" + , account = getAccount flags + , body = Http.emptyBody + , expect = Http.expectJson receive Api.Model.BasicResult.decoder + } + + getTagCloud : Flags -> (Result Http.Error TagCloud -> msg) -> Cmd msg getTagCloud flags receive = Http2.authGet diff --git a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm 
b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm index 87696d85..1efef12d 100644 --- a/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm +++ b/modules/webapp/src/main/elm/Comp/CollectiveSettingsForm.elm @@ -30,6 +30,7 @@ type alias Model = , fullTextConfirmText : String , fullTextReIndexResult : Maybe BasicResult , classifierModel : Comp.ClassifierSettingsForm.Model + , startClassifierResult : Maybe BasicResult } @@ -60,6 +61,7 @@ init flags settings = , fullTextConfirmText = "" , fullTextReIndexResult = Nothing , classifierModel = cm + , startClassifierResult = Nothing } , Cmd.map ClassifierSettingMsg cc ) @@ -91,6 +93,8 @@ type Msg | TriggerReIndexResult (Result Http.Error BasicResult) | ClassifierSettingMsg Comp.ClassifierSettingsForm.Msg | SaveSettings + | StartClassifierTask + | StartClassifierResp (Result Http.Error BasicResult) update : Flags -> Msg -> Model -> ( Model, Cmd Msg, Maybe CollectiveSettings ) @@ -169,12 +173,30 @@ update flags msg model = _ -> ( model, Cmd.none, Nothing ) + StartClassifierTask -> + ( model, Api.startClassifier flags StartClassifierResp, Nothing ) + + StartClassifierResp (Ok br) -> + ( { model | startClassifierResult = Just br } + , Cmd.none + , Nothing + ) + + StartClassifierResp (Err err) -> + ( { model + | startClassifierResult = + Just (BasicResult False (Util.Http.errorToString err)) + } + , Cmd.none + , Nothing + ) + view : Flags -> UiSettings -> Model -> Html Msg view flags settings model = div [ classList - [ ( "ui form", True ) + [ ( "ui form error success", True ) , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False ) , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True ) ] @@ -250,18 +272,7 @@ view flags settings model = [ text "This starts a task that clears the full-text index and re-indexes all your data again." , text "You must type OK before clicking the button to avoid accidental re-indexing." 
] - , div - [ classList - [ ( "ui message", True ) - , ( "error", Maybe.map .success model.fullTextReIndexResult == Just False ) - , ( "success", Maybe.map .success model.fullTextReIndexResult == Just True ) - , ( "hidden invisible", model.fullTextReIndexResult == Nothing ) - ] - ] - [ Maybe.map .message model.fullTextReIndexResult - |> Maybe.withDefault "" - |> text - ] + , renderResultMessage model.fullTextReIndexResult ] , h3 [ classList @@ -279,6 +290,19 @@ view flags settings model = ] [ Html.map ClassifierSettingMsg (Comp.ClassifierSettingsForm.view model.classifierModel) + , div [ class "ui vertical segment" ] + [ button + [ classList + [ ( "ui small secondary basic button", True ) + , ( "disabled", not model.classifierModel.enabled ) + ] + , title "Starts a task to train a classifier" + , onClick StartClassifierTask + ] + [ text "Start now" + ] + , renderResultMessage model.startClassifierResult + ] ] , div [ class "ui divider" ] [] , button @@ -291,3 +315,19 @@ view flags settings model = [ text "Save" ] ] + + +renderResultMessage : Maybe BasicResult -> Html msg +renderResultMessage result = + div + [ classList + [ ( "ui message", True ) + , ( "error", Maybe.map .success result == Just False ) + , ( "success", Maybe.map .success result == Just True ) + , ( "hidden invisible", result == Nothing ) + ] + ] + [ Maybe.map .message result + |> Maybe.withDefault "" + |> text + ] From 4309bd8dfd6976697dbbe74ec600d53140c767ec Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 1 Sep 2020 23:57:27 +0200 Subject: [PATCH 08/10] Some cleanup --- modules/joex/src/main/resources/reference.conf | 6 +++--- .../scala/docspell/joex/learn/LearnClassifierTask.scala | 5 ++++- .../src/main/scala/docspell/joex/process/TextAnalysis.scala | 6 +++--- .../store/src/main/scala/docspell/store/queries/QItem.scala | 5 +++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf 
index e09bfd3b..23ec5b47 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -299,9 +299,8 @@ docspell.joex { # multiple are given, they are all tried and the "best" is # chosen at the end. See # https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html - # for more info about these settings. The settings are almost - # identical to them, as they yielded best results with *my* - # dataset. + # for more info about these settings. The settings here yielded + # good results with *my* dataset. # # Enclose regexps in triple quotes. classifiers = [ @@ -312,6 +311,7 @@ docspell.joex { "maxNGramLeng" = "4" "minNGramLeng" = "1" "splitWordShape" = "chris4" + "intern" = "true" # makes it slower but saves memory } ] } diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 013cd215..c3d6e3f9 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -20,6 +20,7 @@ import bitpeace.MimetypeHint object LearnClassifierTask { val noClass = "__NONE__" + val pageSep = " --n-- " type Args = LearnClassifierArgs @@ -80,7 +81,9 @@ object LearnClassifierTask { val connStream = for { item <- QItem.findAllNewesFirst(ctx.args.collective, 10).through(restrictTo(max)) - tt <- Stream.eval(QItem.resolveTextAndTag(ctx.args.collective, item, category)) + tt <- Stream.eval( + QItem.resolveTextAndTag(ctx.args.collective, item, category, pageSep) + ) } yield Data(tt.tag.map(_.name).getOrElse(noClass), item.id, tt.text.trim) ctx.store.transact(connStream.filter(_.text.nonEmpty)) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 039f52e7..ebb0894a 100644 --- 
a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -11,6 +11,7 @@ import docspell.analysis.nlp.TextClassifier import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile +import docspell.joex.learn.LearnClassifierTask import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task @@ -76,7 +77,7 @@ object TextAnalysis { for { model <- findActiveModel(ctx, cfg) _ <- OptionT.liftF(ctx.logger.info(s"Guessing tag …")) - text = metas.flatMap(_.content).mkString(" ------ ") + text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) modelData = ctx.store.bitpeace .get(model.id) @@ -89,8 +90,7 @@ object TextAnalysis { .compile .drain .flatMap(_ => classifier.classify(ctx.logger, ClassifierModel(modelFile), text)) - - }) + }).filter(_ != LearnClassifierTask.noClass) _ <- OptionT.liftF(ctx.logger.debug(s"Guessed tag: ${cls}")) } yield cls diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 312523ce..d3d2653e 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -634,7 +634,8 @@ object QItem { def resolveTextAndTag( collective: Ident, itemId: Ident, - tagCategory: String + tagCategory: String, + pageSep: String ): ConnectionIO[TextAndTag] = { val aId = RAttachment.Columns.id.prefix("a") val aItem = RAttachment.Columns.itemId.prefix("a") @@ -682,7 +683,7 @@ object QItem { s"Got ${texts.size} text and tag entries for item ${itemId.id}" ) tag = texts.headOption.flatMap(_._2) - txt = texts.map(_._1).mkString(" --n-- ") + txt = texts.map(_._1).mkString(pageSep) } yield TextAndTag(itemId, txt, tag) } From 145c3084614f199269c64b92a6dcaf1f79d950ec Mon Sep 17 00:00:00 2001 From: Eike 
Kettner Date: Wed, 2 Sep 2020 00:18:55 +0200 Subject: [PATCH 09/10] Update documentation --- website/elm/Feature.elm | 2 +- website/site/content/docs/webapp/metadata.md | 25 ++++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/website/elm/Feature.elm b/website/elm/Feature.elm index 246aa7ad..4d2fb734 100644 --- a/website/elm/Feature.elm +++ b/website/elm/Feature.elm @@ -67,7 +67,7 @@ Text is extracted from all files. For scanned documents/images, OCR is used by u , { image = "img/analyze-feature.png" , header = "Text Analysis" , description = """ -The extracted text is analyzed and is used to find properties that can be annotated to your documents automatically. +The extracted text is analyzed using ML techniques to find properties that can be annotated to your documents automatically. """ } , { image = "img/filetype-feature.svg" diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md index 36e5d57c..0f5e23b2 100644 --- a/website/site/content/docs/webapp/metadata.md +++ b/website/site/content/docs/webapp/metadata.md @@ -33,11 +33,26 @@ workflows, a tag category *state* may exist that includes tags like "assignment" semantics. Docspell doesn't propose any workflow, but it can help to implement some. -The tags are *not* taken into account when creating suggestions from -analyzed text yet. However, PDF files may contain metadata itself and -if there is a metadata *keywords* list, these keywords are matched -against the tags in the database. If they match, the item is tagged -automatically. +Docspell can try to predict a tag for new incoming documents +automatically based on your existing data. This requires training an +algorithm. There are some caveats: the more data you have correctly +tagged, the better the results are. So it won't work well for maybe +the first 100 documents. Then the tags must somehow relate to a +pattern in the document text.
Tags like *todo* or *waiting* probably +won't work, obviously. But the typical "document type" tag, like +*invoice* and *receipt* is a good fit! That is why you need to provide +a tag category so only sensible tags are being learned. The algorithm +goes through all your items and learns patterns in the text that +relate to the given tags. This training step can be run periodically, +as specified in your collective settings such that docspell keeps +learning from your already tagged data! More information about the +algorithm can be found in the config, where it is possible to +fine-tune this process. + +Another way to have items tagged automatically is when an input PDF +file contains a list of keywords in its metadata section (this only +applies to PDF files). These keywords are then matched against the +tags in the database. If they match, the item is tagged with them. ## Organization and Person From afbe9554b6b14c60d6e4395ca18397c05a21b15c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Wed, 2 Sep 2020 22:23:08 +0200 Subject: [PATCH 10/10] Update joex nixos module --- nix/module-joex.nix | 68 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/nix/module-joex.nix b/nix/module-joex.nix index d550c2d3..7619711f 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -95,6 +95,21 @@ let enabled = true; file-cache-time = "1 minute"; }; + classification = { + enabled = true; + item-count = 0; + classifiers = [ + { "useSplitWords" = "true"; + "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.''; + "splitWordsIgnoreRegexp" = ''\s+''; + "useSplitPrefixSuffixNGrams" = "true"; + "maxNGramLeng" = "4"; + "minNGramLeng" = "1"; + "splitWordShape" = "chris4"; + "intern" = "true"; + } + ]; + }; working-dir = "/tmp/docspell-analysis"; }; processing = { @@ -736,6 +751,59 @@ in { default = defaults.text-analysis.regex-ner; description = ""; }; + + classification = mkOption { + type = 
types.submodule({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.text-analysis.classification.enabled; + description = '' + Whether to enable classification globally. Each collective can + decide to disable it. If it is disabled here, no collective + can use classification. + ''; + }; + item-count = mkOption { + type = types.int; + default = defaults.text-analysis.classification.item-count; + description = '' + If concerned with memory consumption, this restricts the + number of items to consider. More are better for training. A + negative value or zero means to train on all items. + ''; + }; + classifiers = mkOption { + type = types.listOf types.attrs; + default = defaults.text-analysis.classification.classifiers; + description = '' + These settings are used to configure the classifier. If + multiple are given, they are all tried and the "best" is + chosen at the end. See + https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html + for more info about these settings. The settings here yielded + good results with *my* dataset. + ''; + }; + + }; + }); + default = defaults.text-analysis.classification; + description = '' + Settings for doing document classification. + + This works by learning from existing documents. A collective can + specify a tag category and the system will try to predict a tag + from this category for new incoming documents. + + This requires a statistical model that is computed from all + existing documents. This process is run periodically as + configured by the collective. It may require a lot of memory, + depending on the amount of data. + + It utilises this NLP library: https://nlp.stanford.edu/. + ''; + }; }; }); default = defaults.text-analysis;