From 1cd34414628eeca481f61b5711aa6aa7c8c0557a Mon Sep 17 00:00:00 2001 From: Eike Kettner <eike.kettner@posteo.de> Date: Tue, 19 Jan 2021 22:04:13 +0100 Subject: [PATCH] Run classifier for item entities (concerned, correspondent) Store the results separately from nlp results in attachment metadata. --- .../scala/docspell/joex/learn/Classify.scala | 9 ++- .../joex/learn/LearnClassifierTask.scala | 2 +- .../joex/process/AttachmentPageCount.scala | 1 + .../docspell/joex/process/CreateItem.scala | 6 +- .../docspell/joex/process/ItemData.scala | 8 +- .../docspell/joex/process/ReProcessItem.scala | 3 +- .../docspell/joex/process/SaveProposals.scala | 5 +- .../docspell/joex/process/TextAnalysis.scala | 81 +++++++++++++++---- .../h2/V1.19.0__add_classify_meta.sql | 3 + .../mariadb/V1.19.0__add_classify_meta.sql | 3 + .../postgresql/V1.19.0__add_classify_meta.sql | 3 + .../store/records/RAttachmentMeta.scala | 42 +++++++--- .../store/records/RClassifierSetting.scala | 2 +- 13 files changed, 131 insertions(+), 37 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql diff --git a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala index ae34d18f..4c65556c 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/Classify.scala @@ -1,15 +1,18 @@ package docspell.joex.learn import java.nio.file.Path -import cats.implicits._ -import bitpeace.RangeDef + import cats.data.OptionT import cats.effect._ -import docspell.store.Store +import cats.implicits._ + import docspell.analysis.classifier.{ClassifierModel, TextClassifier} import docspell.common._ +import docspell.store.Store import docspell.store.records.RClassifierModel +import bitpeace.RangeDef + object Classify { def apply[F[_]: Sync: ContextShift]( diff --git a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala index 354a8e39..e3aae66f 100644 --- a/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala +++ b/modules/joex/src/main/scala/docspell/joex/learn/LearnClassifierTask.scala @@ -89,7 +89,7 @@ object LearnClassifierTask { ): OptionT[F, OCollective.Classifier] = if (cfg.classification.enabled) OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) - .filter(_.enabled) + .filter(_.autoTagEnabled) .map(OCollective.Classifier.fromRecord) else OptionT.none diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala index 0373db8a..15678322 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -84,6 +84,7 @@ object AttachmentPageCount { Nil, MetaProposalList.empty, md.pageCount.some, + None, None ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index fe21203b..8bc9ccc1 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -107,7 +107,8 @@ object CreateItem { Vector.empty, fm.map(a => a.id -> a.fileId).toMap, MetaProposalList.empty, - Nil + Nil, + None ) } @@ -166,7 +167,8 @@ object CreateItem { Vector.empty, origMap, MetaProposalList.empty, - Nil + Nil, + None ) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 0435e37c..a151e8a6 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} * containng the source or origin file * @param givenMeta meta data to this item that was not "guessed" * from an attachment but given and thus is always correct + * @param classifyProposals these are proposals that were obtained by + * a trained classifier. There are no ner-tags, it will only provide a + * single label */ case class ItemData( item: RItem, @@ -23,7 +26,10 @@ case class ItemData( dateLabels: Vector[AttachmentDates], originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id givenMeta: MetaProposalList, // given meta data not associated to a specific attachment - tags: List[String] // a list of tags (names or ids) attached to the item if they exist + // a list of tags (names or ids) attached to the item if they exist + tags: List[String], + // proposals obtained from the classifier + classifyProposals: Option[MetaProposalList] ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 07fb2901..db41e901 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -65,7 +65,8 @@ object ReProcessItem { Vector.empty, asrcMap.view.mapValues(_.fileId).toMap, MetaProposalList.empty, - Nil + Nil, + None )).getOrElseF( Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}")) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala index ee4fd923..9d2f0ae3 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/SaveProposals.scala @@ -17,7 +17,10 @@ object SaveProposals { data.metas .traverse(rm => ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> - ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) + ctx.store.transact( + RAttachmentMeta + .updateProposals(rm.id, rm.proposals, data.classifyProposals) + ) ) .map(_ => data) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index b2d50f75..a2561e07 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -1,9 +1,12 @@ package docspell.joex.process +import cats.Traverse import cats.effect._ import cats.implicits._ + import docspell.analysis.classifier.TextClassifier import docspell.analysis.{NlpSettings, TextAnalyser} +import docspell.common.MetaProposal.Candidate import docspell.common._ import docspell.joex.Config import docspell.joex.analysis.RegexNerFile @@ -37,12 +40,22 @@ object TextAnalysis { e <- s _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") v = t.toVector - classifierEnabled <- getActive(ctx, cfg) + autoTagEnabled <- getActiveAutoTag(ctx, cfg) tag <- - if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) + if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) else List.empty[String].pure[F] + + classProposals <- + if (cfg.classification.enabled) + predictItemEntities(ctx, cfg, item.metas, analyser.classifier) + else MetaProposalList.empty.pure[F] + } yield item - .copy(metas = v.map(_._1), dateLabels = v.map(_._2)) + .copy( + metas = v.map(_._1), + dateLabels = v.map(_._2), + classifyProposals = classProposals.some + ) .appendTags(tag) } @@ -72,15 +85,8 @@ object TextAnalysis { ): F[List[String]] = { val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) val classifyWith: ClassifierName => F[Option[String]] = - Classify[F]( - ctx.blocker, - ctx.logger, - cfg.workingDir, - ctx.store, - classifier, - ctx.args.meta.collective, - text - ) + makeClassify(ctx, cfg, classifier)(text) + for { names <- ctx.store.transact( ClassifierName.findTagClassifiers(ctx.args.meta.collective) @@ -90,14 +96,61 @@ object TextAnalysis { } yield tags.flatten } - private def getActive[F[_]: Sync]( + def predictItemEntities[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + metas: Vector[RAttachmentMeta], + classifier: TextClassifier[F] + ): F[MetaProposalList] = { + val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) + + def classifyWith( + cname: ClassifierName, + mtype: MetaProposalType + ): F[Option[MetaProposal]] = + for { + _ <- ctx.logger.debug(s"Guessing $mtype using classifier") + label <- makeClassify(ctx, cfg, classifier)(text).apply(cname) + } yield label.map(str => + MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), str), Set.empty)) + ) + + Traverse[List] + .sequence( + List( + classifyWith(ClassifierName.correspondentOrg, MetaProposalType.CorrOrg), + classifyWith(ClassifierName.correspondentPerson, MetaProposalType.CorrPerson), + classifyWith(ClassifierName.concernedPerson, MetaProposalType.ConcPerson), + classifyWith(ClassifierName.concernedEquip, MetaProposalType.ConcEquip) + ) + ) + .map(_.flatten) + .map(MetaProposalList.apply) + } + + private def makeClassify[F[_]: Sync: ContextShift]( + ctx: Context[F, Args], + cfg: Config.TextAnalysis, + classifier: TextClassifier[F] + )(text: String): ClassifierName => F[Option[String]] = + Classify[F]( + ctx.blocker, + ctx.logger, + cfg.workingDir, + ctx.store, + classifier, + ctx.args.meta.collective, + text + ) + + private def getActiveAutoTag[F[_]: Sync]( ctx: Context[F, Args], cfg: Config.TextAnalysis ): F[Boolean] = if (cfg.classification.enabled) ctx.store .transact(RClassifierSetting.findById(ctx.args.meta.collective)) - .map(_.exists(_.enabled)) + .map(_.exists(_.autoTagEnabled)) .flatTap(enabled => if (enabled) ().pure[F] else ctx.logger.info("Classification is disabled. Check config or settings.") diff --git a/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..2513dc8d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "classify_proposals" text; + diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..fdc3c9f0 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE `attachmentmeta` +ADD COLUMN (`classify_proposals` mediumtext); + diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql new file mode 100644 index 00000000..2513dc8d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.19.0__add_classify_meta.sql @@ -0,0 +1,3 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "classify_proposals" text; + diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index 919a5b17..f201525c 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -16,7 +16,8 @@ case class RAttachmentMeta( nerlabels: List[NerLabel], proposals: MetaProposalList, pages: Option[Int], - language: Option[Language] + language: Option[Language], + classifyProposals: Option[MetaProposalList] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = @@ -29,19 +30,28 @@ case class RAttachmentMeta( object RAttachmentMeta { def empty(attachId: Ident, lang: Language) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None) final case class Table(alias: Option[String]) extends TableDef { val tableName = "attachmentmeta" - val id = Column[Ident]("attachid", this) - val content = Column[String]("content", this) - val nerlabels = Column[List[NerLabel]]("nerlabels", this) - val proposals = Column[MetaProposalList]("itemproposals", this) - val pages = Column[Int]("page_count", this) - val language = Column[Language]("language", this) + val id = Column[Ident]("attachid", this) + val content = Column[String]("content", this) + val nerlabels = Column[List[NerLabel]]("nerlabels", this) + val proposals = Column[MetaProposalList]("itemproposals", this) + val pages = Column[Int]("page_count", this) + val language = Column[Language]("language", this) + val classifyProposals = Column[MetaProposalList]("classify_proposals", this) val all = - NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language) + NonEmptyList.of[Column[_]]( + id, + content, + nerlabels, + proposals, + pages, + language, + classifyProposals + ) } val T = Table(None) @@ -52,7 +62,7 @@ object RAttachmentMeta { DML.insert( T, T.all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}" ) def exists(attachId: Ident): ConnectionIO[Boolean] = @@ -80,7 +90,8 @@ object RAttachmentMeta { DML.set( T.content.setTo(v.content), T.nerlabels.setTo(v.nerlabels), - T.proposals.setTo(v.proposals) + T.proposals.setTo(v.proposals), + T.classifyProposals.setTo(v.classifyProposals) ) ) @@ -93,12 +104,17 @@ object RAttachmentMeta { ) ) - def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] = + def updateProposals( + mid: Ident, + plist: MetaProposalList, + clist: Option[MetaProposalList] + ): ConnectionIO[Int] = DML.update( T, T.id === mid, DML.set( - T.proposals.setTo(plist) + T.proposals.setTo(plist), + T.classifyProposals.setTo(clist) ) ) diff --git a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala index 9c31a5c2..1d7fd5f6 100644 --- a/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala +++ b/modules/store/src/main/scala/docspell/store/records/RClassifierSetting.scala @@ -20,7 +20,7 @@ case class RClassifierSetting( listType: ListType ) { - def enabled: Boolean = + def autoTagEnabled: Boolean = listType match { case ListType.Blacklist => true