Run classifier for item entities (concerned, correspondent)

Store the results separately from nlp results in attachment metadata.
This commit is contained in:
Eike Kettner 2021-01-19 22:04:13 +01:00
parent d124f0c1a9
commit 1cd3441462
13 changed files with 131 additions and 37 deletions

View File

@ -1,15 +1,18 @@
package docspell.joex.learn package docspell.joex.learn
import java.nio.file.Path import java.nio.file.Path
import cats.implicits._
import bitpeace.RangeDef
import cats.data.OptionT import cats.data.OptionT
import cats.effect._ import cats.effect._
import docspell.store.Store import cats.implicits._
import docspell.analysis.classifier.{ClassifierModel, TextClassifier} import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
import docspell.common._ import docspell.common._
import docspell.store.Store
import docspell.store.records.RClassifierModel import docspell.store.records.RClassifierModel
import bitpeace.RangeDef
object Classify { object Classify {
def apply[F[_]: Sync: ContextShift]( def apply[F[_]: Sync: ContextShift](

View File

@ -89,7 +89,7 @@ object LearnClassifierTask {
): OptionT[F, OCollective.Classifier] = ): OptionT[F, OCollective.Classifier] =
if (cfg.classification.enabled) if (cfg.classification.enabled)
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective))) OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
.filter(_.enabled) .filter(_.autoTagEnabled)
.map(OCollective.Classifier.fromRecord) .map(OCollective.Classifier.fromRecord)
else else
OptionT.none OptionT.none

View File

@ -84,6 +84,7 @@ object AttachmentPageCount {
Nil, Nil,
MetaProposalList.empty, MetaProposalList.empty,
md.pageCount.some, md.pageCount.some,
None,
None None
) )
) )

View File

@ -107,7 +107,8 @@ object CreateItem {
Vector.empty, Vector.empty,
fm.map(a => a.id -> a.fileId).toMap, fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty, MetaProposalList.empty,
Nil Nil,
None
) )
} }
@ -166,7 +167,8 @@ object CreateItem {
Vector.empty, Vector.empty,
origMap, origMap,
MetaProposalList.empty, MetaProposalList.empty,
Nil Nil,
None
) )
) )
} }

View File

@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
* containing the source or origin file * containing the source or origin file
* @param givenMeta meta data to this item that was not "guessed" * @param givenMeta meta data to this item that was not "guessed"
* from an attachment but given and thus is always correct * from an attachment but given and thus is always correct
* @param classifyProposals these are proposals that were obtained by
* a trained classifier. There are no ner-tags, it will only provide a
* single label
*/ */
case class ItemData( case class ItemData(
item: RItem, item: RItem,
@ -23,7 +26,10 @@ case class ItemData(
dateLabels: Vector[AttachmentDates], dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
tags: List[String] // a list of tags (names or ids) attached to the item if they exist // a list of tags (names or ids) attached to the item if they exist
tags: List[String],
// proposals obtained from the classifier
classifyProposals: Option[MetaProposalList]
) { ) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] = def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -65,7 +65,8 @@ object ReProcessItem {
Vector.empty, Vector.empty,
asrcMap.view.mapValues(_.fileId).toMap, asrcMap.view.mapValues(_.fileId).toMap,
MetaProposalList.empty, MetaProposalList.empty,
Nil Nil,
None
)).getOrElseF( )).getOrElseF(
Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}")) Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
) )

View File

@ -17,7 +17,10 @@ object SaveProposals {
data.metas data.metas
.traverse(rm => .traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *> ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals)) ctx.store.transact(
RAttachmentMeta
.updateProposals(rm.id, rm.proposals, data.classifyProposals)
)
) )
.map(_ => data) .map(_ => data)
} }

View File

@ -1,9 +1,12 @@
package docspell.joex.process package docspell.joex.process
import cats.Traverse
import cats.effect._ import cats.effect._
import cats.implicits._ import cats.implicits._
import docspell.analysis.classifier.TextClassifier import docspell.analysis.classifier.TextClassifier
import docspell.analysis.{NlpSettings, TextAnalyser} import docspell.analysis.{NlpSettings, TextAnalyser}
import docspell.common.MetaProposal.Candidate
import docspell.common._ import docspell.common._
import docspell.joex.Config import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile import docspell.joex.analysis.RegexNerFile
@ -37,12 +40,22 @@ object TextAnalysis {
e <- s e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}") _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector v = t.toVector
classifierEnabled <- getActive(ctx, cfg) autoTagEnabled <- getActiveAutoTag(ctx, cfg)
tag <- tag <-
if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier) if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
else List.empty[String].pure[F] else List.empty[String].pure[F]
classProposals <-
if (cfg.classification.enabled)
predictItemEntities(ctx, cfg, item.metas, analyser.classifier)
else MetaProposalList.empty.pure[F]
} yield item } yield item
.copy(metas = v.map(_._1), dateLabels = v.map(_._2)) .copy(
metas = v.map(_._1),
dateLabels = v.map(_._2),
classifyProposals = classProposals.some
)
.appendTags(tag) .appendTags(tag)
} }
@ -72,15 +85,8 @@ object TextAnalysis {
): F[List[String]] = { ): F[List[String]] = {
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep) val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
val classifyWith: ClassifierName => F[Option[String]] = val classifyWith: ClassifierName => F[Option[String]] =
Classify[F]( makeClassify(ctx, cfg, classifier)(text)
ctx.blocker,
ctx.logger,
cfg.workingDir,
ctx.store,
classifier,
ctx.args.meta.collective,
text
)
for { for {
names <- ctx.store.transact( names <- ctx.store.transact(
ClassifierName.findTagClassifiers(ctx.args.meta.collective) ClassifierName.findTagClassifiers(ctx.args.meta.collective)
@ -90,14 +96,61 @@ object TextAnalysis {
} yield tags.flatten } yield tags.flatten
} }
private def getActive[F[_]: Sync]( def predictItemEntities[F[_]: Sync: ContextShift](
ctx: Context[F, Args],
cfg: Config.TextAnalysis,
metas: Vector[RAttachmentMeta],
classifier: TextClassifier[F]
): F[MetaProposalList] = {
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
def classifyWith(
cname: ClassifierName,
mtype: MetaProposalType
): F[Option[MetaProposal]] =
for {
_ <- ctx.logger.debug(s"Guessing $mtype using classifier")
label <- makeClassify(ctx, cfg, classifier)(text).apply(cname)
} yield label.map(str =>
MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), str), Set.empty))
)
Traverse[List]
.sequence(
List(
classifyWith(ClassifierName.correspondentOrg, MetaProposalType.CorrOrg),
classifyWith(ClassifierName.correspondentPerson, MetaProposalType.CorrPerson),
classifyWith(ClassifierName.concernedPerson, MetaProposalType.ConcPerson),
classifyWith(ClassifierName.concernedEquip, MetaProposalType.ConcEquip)
)
)
.map(_.flatten)
.map(MetaProposalList.apply)
}
/** Prepares the classification function for one piece of text.
  *
  * Hands the job's blocker, logger, store and collective, the configured
  * working directory, the given classifier and the text over to `Classify`.
  *
  * @param ctx the job context supplying blocker, logger, store and collective
  * @param cfg the text analysis config; only `workingDir` is read here
  * @param classifier the text classifier passed through to `Classify`
  * @param text the text to run the classifier on
  * @return a function from a classifier name to an effect yielding an
  *   optional string (presumably the predicted label; confirm against
  *   `Classify`)
  */
private def makeClassify[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    classifier: TextClassifier[F]
)(text: String): ClassifierName => F[Option[String]] = {
  val collective = ctx.args.meta.collective
  val workDir    = cfg.workingDir
  Classify[F](ctx.blocker, ctx.logger, workDir, ctx.store, classifier, collective, text)
}
private def getActiveAutoTag[F[_]: Sync](
ctx: Context[F, Args], ctx: Context[F, Args],
cfg: Config.TextAnalysis cfg: Config.TextAnalysis
): F[Boolean] = ): F[Boolean] =
if (cfg.classification.enabled) if (cfg.classification.enabled)
ctx.store ctx.store
.transact(RClassifierSetting.findById(ctx.args.meta.collective)) .transact(RClassifierSetting.findById(ctx.args.meta.collective))
.map(_.exists(_.enabled)) .map(_.exists(_.autoTagEnabled))
.flatTap(enabled => .flatTap(enabled =>
if (enabled) ().pure[F] if (enabled) ().pure[F]
else ctx.logger.info("Classification is disabled. Check config or settings.") else ctx.logger.info("Classification is disabled. Check config or settings.")

View File

@ -0,0 +1,3 @@
-- Adds the "classify_proposals" text column to "attachmentmeta". The column
-- is declared without NOT NULL, so rows may leave it unset.
ALTER TABLE "attachmentmeta"
ADD COLUMN "classify_proposals" text;

View File

@ -0,0 +1,3 @@
-- Adds the `classify_proposals` mediumtext column to `attachmentmeta`. The
-- column is declared without NOT NULL, so rows may leave it unset.
ALTER TABLE `attachmentmeta`
ADD COLUMN (`classify_proposals` mediumtext);

View File

@ -0,0 +1,3 @@
-- Adds the "classify_proposals" text column to "attachmentmeta". The column
-- is declared without NOT NULL, so rows may leave it unset.
ALTER TABLE "attachmentmeta"
ADD COLUMN "classify_proposals" text;

View File

@ -16,7 +16,8 @@ case class RAttachmentMeta(
nerlabels: List[NerLabel], nerlabels: List[NerLabel],
proposals: MetaProposalList, proposals: MetaProposalList,
pages: Option[Int], pages: Option[Int],
language: Option[Language] language: Option[Language],
classifyProposals: Option[MetaProposalList]
) { ) {
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
@ -29,19 +30,28 @@ case class RAttachmentMeta(
object RAttachmentMeta { object RAttachmentMeta {
def empty(attachId: Ident, lang: Language) = def empty(attachId: Ident, lang: Language) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None)
final case class Table(alias: Option[String]) extends TableDef { final case class Table(alias: Option[String]) extends TableDef {
val tableName = "attachmentmeta" val tableName = "attachmentmeta"
val id = Column[Ident]("attachid", this) val id = Column[Ident]("attachid", this)
val content = Column[String]("content", this) val content = Column[String]("content", this)
val nerlabels = Column[List[NerLabel]]("nerlabels", this) val nerlabels = Column[List[NerLabel]]("nerlabels", this)
val proposals = Column[MetaProposalList]("itemproposals", this) val proposals = Column[MetaProposalList]("itemproposals", this)
val pages = Column[Int]("page_count", this) val pages = Column[Int]("page_count", this)
val language = Column[Language]("language", this) val language = Column[Language]("language", this)
val classifyProposals = Column[MetaProposalList]("classify_proposals", this)
val all = val all =
NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language) NonEmptyList.of[Column[_]](
id,
content,
nerlabels,
proposals,
pages,
language,
classifyProposals
)
} }
val T = Table(None) val T = Table(None)
@ -52,7 +62,7 @@ object RAttachmentMeta {
DML.insert( DML.insert(
T, T,
T.all, T.all,
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}"
) )
def exists(attachId: Ident): ConnectionIO[Boolean] = def exists(attachId: Ident): ConnectionIO[Boolean] =
@ -80,7 +90,8 @@ object RAttachmentMeta {
DML.set( DML.set(
T.content.setTo(v.content), T.content.setTo(v.content),
T.nerlabels.setTo(v.nerlabels), T.nerlabels.setTo(v.nerlabels),
T.proposals.setTo(v.proposals) T.proposals.setTo(v.proposals),
T.classifyProposals.setTo(v.classifyProposals)
) )
) )
@ -93,12 +104,17 @@ object RAttachmentMeta {
) )
) )
def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] = def updateProposals(
mid: Ident,
plist: MetaProposalList,
clist: Option[MetaProposalList]
): ConnectionIO[Int] =
DML.update( DML.update(
T, T,
T.id === mid, T.id === mid,
DML.set( DML.set(
T.proposals.setTo(plist) T.proposals.setTo(plist),
T.classifyProposals.setTo(clist)
) )
) )

View File

@ -20,7 +20,7 @@ case class RClassifierSetting(
listType: ListType listType: ListType
) { ) {
def enabled: Boolean = def autoTagEnabled: Boolean =
listType match { listType match {
case ListType.Blacklist => case ListType.Blacklist =>
true true