Run classifier for item entities (concerned, correspondent)

Store the results separately from nlp results in attachment metadata.
This commit is contained in:
Eike Kettner 2021-01-19 22:04:13 +01:00
parent d124f0c1a9
commit 1cd3441462
13 changed files with 131 additions and 37 deletions

View File

@ -1,15 +1,18 @@
package docspell.joex.learn
import java.nio.file.Path
import cats.implicits._
import bitpeace.RangeDef
import cats.data.OptionT
import cats.effect._
import docspell.store.Store
import cats.implicits._
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
import docspell.common._
import docspell.store.Store
import docspell.store.records.RClassifierModel
import bitpeace.RangeDef
object Classify {
def apply[F[_]: Sync: ContextShift](

View File

@ -89,7 +89,7 @@ object LearnClassifierTask {
): OptionT[F, OCollective.Classifier] =
if (cfg.classification.enabled)
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
.filter(_.enabled)
.filter(_.autoTagEnabled)
.map(OCollective.Classifier.fromRecord)
else
OptionT.none

View File

@ -84,6 +84,7 @@ object AttachmentPageCount {
Nil,
MetaProposalList.empty,
md.pageCount.some,
None,
None
)
)

View File

@ -107,7 +107,8 @@ object CreateItem {
Vector.empty,
fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty,
Nil
Nil,
None
)
}
@ -166,7 +167,8 @@ object CreateItem {
Vector.empty,
origMap,
MetaProposalList.empty,
Nil
Nil,
None
)
)
}

View File

@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
* containing the source or origin file
* @param givenMeta meta data to this item that was not "guessed"
* from an attachment but given and thus is always correct
* @param classifyProposals these are proposals that were obtained by
* a trained classifier. They carry no ner-tags; the classifier only
* provides a single label
*/
case class ItemData(
item: RItem,
@ -23,7 +26,10 @@ case class ItemData(
dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
// a list of tags (names or ids) attached to the item if they exist
tags: List[String],
// proposals obtained from the classifier
classifyProposals: Option[MetaProposalList]
) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -65,7 +65,8 @@ object ReProcessItem {
Vector.empty,
asrcMap.view.mapValues(_.fileId).toMap,
MetaProposalList.empty,
Nil
Nil,
None
)).getOrElseF(
Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
)

View File

@ -17,7 +17,10 @@ object SaveProposals {
data.metas
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
ctx.store.transact(
RAttachmentMeta
.updateProposals(rm.id, rm.proposals, data.classifyProposals)
)
)
.map(_ => data)
}

View File

@ -1,9 +1,12 @@
package docspell.joex.process
import cats.Traverse
import cats.effect._
import cats.implicits._
import docspell.analysis.classifier.TextClassifier
import docspell.analysis.{NlpSettings, TextAnalyser}
import docspell.common.MetaProposal.Candidate
import docspell.common._
import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile
@ -37,12 +40,22 @@ object TextAnalysis {
e <- s
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
v = t.toVector
classifierEnabled <- getActive(ctx, cfg)
autoTagEnabled <- getActiveAutoTag(ctx, cfg)
tag <-
if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
else List.empty[String].pure[F]
classProposals <-
if (cfg.classification.enabled)
predictItemEntities(ctx, cfg, item.metas, analyser.classifier)
else MetaProposalList.empty.pure[F]
} yield item
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
.copy(
metas = v.map(_._1),
dateLabels = v.map(_._2),
classifyProposals = classProposals.some
)
.appendTags(tag)
}
@ -72,15 +85,8 @@ object TextAnalysis {
): F[List[String]] = {
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
val classifyWith: ClassifierName => F[Option[String]] =
Classify[F](
ctx.blocker,
ctx.logger,
cfg.workingDir,
ctx.store,
classifier,
ctx.args.meta.collective,
text
)
makeClassify(ctx, cfg, classifier)(text)
for {
names <- ctx.store.transact(
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
@ -90,14 +96,61 @@ object TextAnalysis {
} yield tags.flatten
}
private def getActive[F[_]: Sync](
/** Predicts the item-level entities (correspondent organization and
  * person, concerned person and equipment) using the classifier.
  *
  * The content of all given attachment metas is joined with
  * `LearnClassifierTask.pageSep` and classified once per entity type.
  * Every label that is returned becomes a [[MetaProposal]] of the
  * corresponding type; entity types without a label are dropped.
  *
  * @param ctx the task context; its logger is used here and it is
  *   handed on to `makeClassify`
  * @param cfg the text analysis config, handed on to `makeClassify`
  * @param metas attachment metadata whose `content` is classified
  * @param classifier the classifier to run
  * @return a proposal list with one entry per entity type that
  *   produced a label
  */
def predictItemEntities[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): F[MetaProposalList] = {
  val fullText = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)

  // The classifier only yields a label string, so the reference
  // carries an empty id and the candidate an empty set.
  def toProposal(mtype: MetaProposalType)(label: String): MetaProposal =
    MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), label), Set.empty))

  def guess(
      cname: ClassifierName,
      mtype: MetaProposalType
  ): F[Option[MetaProposal]] =
    ctx.logger
      .debug(s"Guessing $mtype using classifier")
      .flatMap(_ =>
        makeClassify(ctx, cfg, classifier)(fullText)
          .apply(cname)
          .map(found => found.map(toProposal(mtype)))
      )

  // The order of this list is the order in which the classifiers run.
  val targets: List[(ClassifierName, MetaProposalType)] =
    List(
      ClassifierName.correspondentOrg    -> MetaProposalType.CorrOrg,
      ClassifierName.correspondentPerson -> MetaProposalType.CorrPerson,
      ClassifierName.concernedPerson     -> MetaProposalType.ConcPerson,
      ClassifierName.concernedEquip      -> MetaProposalType.ConcEquip
    )

  Traverse[List]
    .traverse(targets) { case (cname, mtype) => guess(cname, mtype) }
    .map(_.flatten)
    .map(MetaProposalList.apply)
}
/** Builds the classify function for the given text.
  *
  * Delegates to `Classify`, passing the blocker, logger and store of
  * the task context, the configured working directory and the
  * collective taken from the task arguments.
  *
  * @param ctx the task context
  * @param cfg the text analysis config supplying `workingDir`
  * @param classifier the classifier to run
  * @param text the text to classify
  * @return a function from a classifier name to the optional label
  */
private def makeClassify[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    classifier: TextClassifier[F]
)(text: String): ClassifierName => F[Option[String]] = {
  val workDir    = cfg.workingDir
  val collective = ctx.args.meta.collective
  Classify[F](ctx.blocker, ctx.logger, workDir, ctx.store, classifier, collective, text)
}
private def getActiveAutoTag[F[_]: Sync](
ctx: Context[F, Args],
cfg: Config.TextAnalysis
): F[Boolean] =
if (cfg.classification.enabled)
ctx.store
.transact(RClassifierSetting.findById(ctx.args.meta.collective))
.map(_.exists(_.enabled))
.map(_.exists(_.autoTagEnabled))
.flatTap(enabled =>
if (enabled) ().pure[F]
else ctx.logger.info("Classification is disabled. Check config or settings.")

View File

@ -0,0 +1,3 @@
-- Adds a nullable text column to "attachmentmeta" that holds the
-- proposals obtained from the classifier.
ALTER TABLE "attachmentmeta"
ADD COLUMN "classify_proposals" text;

View File

@ -0,0 +1,3 @@
-- Adds a nullable mediumtext column to `attachmentmeta` that holds the
-- proposals obtained from the classifier.
ALTER TABLE `attachmentmeta`
ADD COLUMN (`classify_proposals` mediumtext);

View File

@ -0,0 +1,3 @@
-- Adds a nullable text column to "attachmentmeta" that holds the
-- proposals obtained from the classifier.
ALTER TABLE "attachmentmeta"
ADD COLUMN "classify_proposals" text;

View File

@ -16,7 +16,8 @@ case class RAttachmentMeta(
nerlabels: List[NerLabel],
proposals: MetaProposalList,
pages: Option[Int],
language: Option[Language]
language: Option[Language],
classifyProposals: Option[MetaProposalList]
) {
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
@ -29,19 +30,28 @@ case class RAttachmentMeta(
object RAttachmentMeta {
def empty(attachId: Ident, lang: Language) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang))
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None)
final case class Table(alias: Option[String]) extends TableDef {
val tableName = "attachmentmeta"
val id = Column[Ident]("attachid", this)
val content = Column[String]("content", this)
val nerlabels = Column[List[NerLabel]]("nerlabels", this)
val proposals = Column[MetaProposalList]("itemproposals", this)
val pages = Column[Int]("page_count", this)
val language = Column[Language]("language", this)
val id = Column[Ident]("attachid", this)
val content = Column[String]("content", this)
val nerlabels = Column[List[NerLabel]]("nerlabels", this)
val proposals = Column[MetaProposalList]("itemproposals", this)
val pages = Column[Int]("page_count", this)
val language = Column[Language]("language", this)
val classifyProposals = Column[MetaProposalList]("classify_proposals", this)
val all =
NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language)
NonEmptyList.of[Column[_]](
id,
content,
nerlabels,
proposals,
pages,
language,
classifyProposals
)
}
val T = Table(None)
@ -52,7 +62,7 @@ object RAttachmentMeta {
DML.insert(
T,
T.all,
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}"
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}"
)
def exists(attachId: Ident): ConnectionIO[Boolean] =
@ -80,7 +90,8 @@ object RAttachmentMeta {
DML.set(
T.content.setTo(v.content),
T.nerlabels.setTo(v.nerlabels),
T.proposals.setTo(v.proposals)
T.proposals.setTo(v.proposals),
T.classifyProposals.setTo(v.classifyProposals)
)
)
@ -93,12 +104,17 @@ object RAttachmentMeta {
)
)
def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] =
def updateProposals(
mid: Ident,
plist: MetaProposalList,
clist: Option[MetaProposalList]
): ConnectionIO[Int] =
DML.update(
T,
T.id === mid,
DML.set(
T.proposals.setTo(plist)
T.proposals.setTo(plist),
T.classifyProposals.setTo(clist)
)
)

View File

@ -20,7 +20,7 @@ case class RClassifierSetting(
listType: ListType
) {
def enabled: Boolean =
def autoTagEnabled: Boolean =
listType match {
case ListType.Blacklist =>
true