mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 18:39:33 +00:00
Run classifier for item entities (concerned, correspondent)
Store the results separately from nlp results in attachment metadata.
This commit is contained in:
parent
d124f0c1a9
commit
1cd3441462
@ -1,15 +1,18 @@
|
||||
package docspell.joex.learn
|
||||
|
||||
import java.nio.file.Path
|
||||
import cats.implicits._
|
||||
import bitpeace.RangeDef
|
||||
|
||||
import cats.data.OptionT
|
||||
import cats.effect._
|
||||
import docspell.store.Store
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||
import docspell.common._
|
||||
import docspell.store.Store
|
||||
import docspell.store.records.RClassifierModel
|
||||
|
||||
import bitpeace.RangeDef
|
||||
|
||||
object Classify {
|
||||
|
||||
def apply[F[_]: Sync: ContextShift](
|
||||
|
@ -89,7 +89,7 @@ object LearnClassifierTask {
|
||||
): OptionT[F, OCollective.Classifier] =
|
||||
if (cfg.classification.enabled)
|
||||
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
|
||||
.filter(_.enabled)
|
||||
.filter(_.autoTagEnabled)
|
||||
.map(OCollective.Classifier.fromRecord)
|
||||
else
|
||||
OptionT.none
|
||||
|
@ -84,6 +84,7 @@ object AttachmentPageCount {
|
||||
Nil,
|
||||
MetaProposalList.empty,
|
||||
md.pageCount.some,
|
||||
None,
|
||||
None
|
||||
)
|
||||
)
|
||||
|
@ -107,7 +107,8 @@ object CreateItem {
|
||||
Vector.empty,
|
||||
fm.map(a => a.id -> a.fileId).toMap,
|
||||
MetaProposalList.empty,
|
||||
Nil
|
||||
Nil,
|
||||
None
|
||||
)
|
||||
}
|
||||
|
||||
@ -166,7 +167,8 @@ object CreateItem {
|
||||
Vector.empty,
|
||||
origMap,
|
||||
MetaProposalList.empty,
|
||||
Nil
|
||||
Nil,
|
||||
None
|
||||
)
|
||||
)
|
||||
}
|
||||
|
@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
|
||||
* containing the source or origin file
|
||||
* @param givenMeta meta data to this item that was not "guessed"
|
||||
* from an attachment but given and thus is always correct
|
||||
* @param classifyProposals these are proposals that were obtained by
|
||||
* a trained classifier. There are no ner-tags, it will only provide a
|
||||
* single label
|
||||
*/
|
||||
case class ItemData(
|
||||
item: RItem,
|
||||
@ -23,7 +26,10 @@ case class ItemData(
|
||||
dateLabels: Vector[AttachmentDates],
|
||||
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
||||
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
|
||||
// a list of tags (names or ids) attached to the item if they exist
|
||||
tags: List[String],
|
||||
// proposals obtained from the classifier
|
||||
classifyProposals: Option[MetaProposalList]
|
||||
) {
|
||||
|
||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||
|
@ -65,7 +65,8 @@ object ReProcessItem {
|
||||
Vector.empty,
|
||||
asrcMap.view.mapValues(_.fileId).toMap,
|
||||
MetaProposalList.empty,
|
||||
Nil
|
||||
Nil,
|
||||
None
|
||||
)).getOrElseF(
|
||||
Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
|
||||
)
|
||||
|
@ -17,7 +17,10 @@ object SaveProposals {
|
||||
data.metas
|
||||
.traverse(rm =>
|
||||
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
|
||||
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
|
||||
ctx.store.transact(
|
||||
RAttachmentMeta
|
||||
.updateProposals(rm.id, rm.proposals, data.classifyProposals)
|
||||
)
|
||||
)
|
||||
.map(_ => data)
|
||||
}
|
||||
|
@ -1,9 +1,12 @@
|
||||
package docspell.joex.process
|
||||
|
||||
import cats.Traverse
|
||||
import cats.effect._
|
||||
import cats.implicits._
|
||||
|
||||
import docspell.analysis.classifier.TextClassifier
|
||||
import docspell.analysis.{NlpSettings, TextAnalyser}
|
||||
import docspell.common.MetaProposal.Candidate
|
||||
import docspell.common._
|
||||
import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
@ -37,12 +40,22 @@ object TextAnalysis {
|
||||
e <- s
|
||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||
v = t.toVector
|
||||
classifierEnabled <- getActive(ctx, cfg)
|
||||
autoTagEnabled <- getActiveAutoTag(ctx, cfg)
|
||||
tag <-
|
||||
if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
|
||||
if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
|
||||
else List.empty[String].pure[F]
|
||||
|
||||
classProposals <-
|
||||
if (cfg.classification.enabled)
|
||||
predictItemEntities(ctx, cfg, item.metas, analyser.classifier)
|
||||
else MetaProposalList.empty.pure[F]
|
||||
|
||||
} yield item
|
||||
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
||||
.copy(
|
||||
metas = v.map(_._1),
|
||||
dateLabels = v.map(_._2),
|
||||
classifyProposals = classProposals.some
|
||||
)
|
||||
.appendTags(tag)
|
||||
}
|
||||
|
||||
@ -72,15 +85,8 @@ object TextAnalysis {
|
||||
): F[List[String]] = {
|
||||
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||
val classifyWith: ClassifierName => F[Option[String]] =
|
||||
Classify[F](
|
||||
ctx.blocker,
|
||||
ctx.logger,
|
||||
cfg.workingDir,
|
||||
ctx.store,
|
||||
classifier,
|
||||
ctx.args.meta.collective,
|
||||
text
|
||||
)
|
||||
makeClassify(ctx, cfg, classifier)(text)
|
||||
|
||||
for {
|
||||
names <- ctx.store.transact(
|
||||
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
|
||||
@ -90,14 +96,61 @@ object TextAnalysis {
|
||||
} yield tags.flatten
|
||||
}
|
||||
|
||||
private def getActive[F[_]: Sync](
|
||||
/** Asks the classifier for the item entities: correspondent organization,
  * correspondent person, concerned person and concerned equipment.
  *
  * The `content` of all given attachment metas is joined with
  * `LearnClassifierTask.pageSep` into one text, which is then handed to
  * each of the four classifiers in turn. A classifier yields at most one
  * label; every label that is found becomes a proposal of the matching
  * type holding a single candidate. That candidate is an `IdRef` whose
  * id is built from the empty string and whose name is the label; its
  * set of ner-labels is empty.
  *
  * @param ctx the task context (logger, store, blocker and arguments)
  * @param cfg the text analysis configuration
  * @param metas attachment metadata whose `content` gets classified
  * @param classifier the text classifier to run
  * @return the proposals of all classifiers that produced a label
  */
def predictItemEntities[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    metas: Vector[RAttachmentMeta],
    classifier: TextClassifier[F]
): F[MetaProposalList] = {
  val fullText = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)

  // Same classify function for every entity; only the classifier name varies.
  val labelFor: ClassifierName => F[Option[String]] =
    makeClassify(ctx, cfg, classifier)(fullText)

  // Turns the (optional) label of one classifier into an (optional) proposal.
  def propose(cname: ClassifierName, mtype: MetaProposalType): F[Option[MetaProposal]] =
    ctx.logger
      .debug(s"Guessing $mtype using classifier")
      .flatMap(_ => labelFor(cname))
      .map { found =>
        found.map { label =>
          MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), label), Set.empty))
        }
      }

  // Classifier name paired with the proposal type its label is stored under.
  val entityClassifiers: List[(ClassifierName, MetaProposalType)] =
    List(
      (ClassifierName.correspondentOrg, MetaProposalType.CorrOrg),
      (ClassifierName.correspondentPerson, MetaProposalType.CorrPerson),
      (ClassifierName.concernedPerson, MetaProposalType.ConcPerson),
      (ClassifierName.concernedEquip, MetaProposalType.ConcEquip)
    )

  val runs: List[F[Option[MetaProposal]]] =
    entityClassifiers.map { case (cname, mtype) => propose(cname, mtype) }

  Traverse[List]
    .sequence(runs)
    .map(results => MetaProposalList(results.flatten))
}
|
||||
|
||||
/** Creates the function that classifies `text` with a classifier given
  * by its name.
  *
  * This only delegates to `Classify`, passing the blocker, logger and
  * store of the context, the working directory of the config, the
  * collective found in the task arguments and the text.
  *
  * @param ctx the task context
  * @param cfg the text analysis configuration providing `workingDir`
  * @param classifier the text classifier to run
  * @param text the text to classify
  * @return a function from classifier name to the optional label
  */
private def makeClassify[F[_]: Sync: ContextShift](
    ctx: Context[F, Args],
    cfg: Config.TextAnalysis,
    classifier: TextClassifier[F]
)(text: String): ClassifierName => F[Option[String]] = {
  val classify: ClassifierName => F[Option[String]] =
    Classify[F](
      ctx.blocker,
      ctx.logger,
      cfg.workingDir,
      ctx.store,
      classifier,
      ctx.args.meta.collective,
      text
    )
  classify
}
|
||||
|
||||
private def getActiveAutoTag[F[_]: Sync](
|
||||
ctx: Context[F, Args],
|
||||
cfg: Config.TextAnalysis
|
||||
): F[Boolean] =
|
||||
if (cfg.classification.enabled)
|
||||
ctx.store
|
||||
.transact(RClassifierSetting.findById(ctx.args.meta.collective))
|
||||
.map(_.exists(_.enabled))
|
||||
.map(_.exists(_.autoTagEnabled))
|
||||
.flatTap(enabled =>
|
||||
if (enabled) ().pure[F]
|
||||
else ctx.logger.info("Classification is disabled. Check config or settings.")
|
||||
|
@ -0,0 +1,3 @@
|
||||
ALTER TABLE "attachmentmeta"
|
||||
ADD COLUMN "classify_proposals" text;
|
||||
|
@ -0,0 +1,3 @@
|
||||
ALTER TABLE `attachmentmeta`
|
||||
ADD COLUMN (`classify_proposals` mediumtext);
|
||||
|
@ -0,0 +1,3 @@
|
||||
ALTER TABLE "attachmentmeta"
|
||||
ADD COLUMN "classify_proposals" text;
|
||||
|
@ -16,7 +16,8 @@ case class RAttachmentMeta(
|
||||
nerlabels: List[NerLabel],
|
||||
proposals: MetaProposalList,
|
||||
pages: Option[Int],
|
||||
language: Option[Language]
|
||||
language: Option[Language],
|
||||
classifyProposals: Option[MetaProposalList]
|
||||
) {
|
||||
|
||||
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
||||
@ -29,19 +30,28 @@ case class RAttachmentMeta(
|
||||
|
||||
object RAttachmentMeta {
|
||||
def empty(attachId: Ident, lang: Language) =
|
||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang))
|
||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None)
|
||||
|
||||
final case class Table(alias: Option[String]) extends TableDef {
|
||||
val tableName = "attachmentmeta"
|
||||
|
||||
val id = Column[Ident]("attachid", this)
|
||||
val content = Column[String]("content", this)
|
||||
val nerlabels = Column[List[NerLabel]]("nerlabels", this)
|
||||
val proposals = Column[MetaProposalList]("itemproposals", this)
|
||||
val pages = Column[Int]("page_count", this)
|
||||
val language = Column[Language]("language", this)
|
||||
val id = Column[Ident]("attachid", this)
|
||||
val content = Column[String]("content", this)
|
||||
val nerlabels = Column[List[NerLabel]]("nerlabels", this)
|
||||
val proposals = Column[MetaProposalList]("itemproposals", this)
|
||||
val pages = Column[Int]("page_count", this)
|
||||
val language = Column[Language]("language", this)
|
||||
val classifyProposals = Column[MetaProposalList]("classify_proposals", this)
|
||||
val all =
|
||||
NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language)
|
||||
NonEmptyList.of[Column[_]](
|
||||
id,
|
||||
content,
|
||||
nerlabels,
|
||||
proposals,
|
||||
pages,
|
||||
language,
|
||||
classifyProposals
|
||||
)
|
||||
}
|
||||
|
||||
val T = Table(None)
|
||||
@ -52,7 +62,7 @@ object RAttachmentMeta {
|
||||
DML.insert(
|
||||
T,
|
||||
T.all,
|
||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}"
|
||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}"
|
||||
)
|
||||
|
||||
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
||||
@ -80,7 +90,8 @@ object RAttachmentMeta {
|
||||
DML.set(
|
||||
T.content.setTo(v.content),
|
||||
T.nerlabels.setTo(v.nerlabels),
|
||||
T.proposals.setTo(v.proposals)
|
||||
T.proposals.setTo(v.proposals),
|
||||
T.classifyProposals.setTo(v.classifyProposals)
|
||||
)
|
||||
)
|
||||
|
||||
@ -93,12 +104,17 @@ object RAttachmentMeta {
|
||||
)
|
||||
)
|
||||
|
||||
def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] =
|
||||
def updateProposals(
|
||||
mid: Ident,
|
||||
plist: MetaProposalList,
|
||||
clist: Option[MetaProposalList]
|
||||
): ConnectionIO[Int] =
|
||||
DML.update(
|
||||
T,
|
||||
T.id === mid,
|
||||
DML.set(
|
||||
T.proposals.setTo(plist)
|
||||
T.proposals.setTo(plist),
|
||||
T.classifyProposals.setTo(clist)
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -20,7 +20,7 @@ case class RClassifierSetting(
|
||||
listType: ListType
|
||||
) {
|
||||
|
||||
def enabled: Boolean =
|
||||
def autoTagEnabled: Boolean =
|
||||
listType match {
|
||||
case ListType.Blacklist =>
|
||||
true
|
||||
|
Loading…
x
Reference in New Issue
Block a user