mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-07 15:45:59 +00:00
Run classifier for item entities (concerned, correspondent)
Store the results separately from the NLP results in the attachment metadata.
This commit is contained in:
parent
d124f0c1a9
commit
1cd3441462
@ -1,15 +1,18 @@
|
|||||||
package docspell.joex.learn
|
package docspell.joex.learn
|
||||||
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import cats.implicits._
|
|
||||||
import bitpeace.RangeDef
|
|
||||||
import cats.data.OptionT
|
import cats.data.OptionT
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import docspell.store.Store
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
import docspell.analysis.classifier.{ClassifierModel, TextClassifier}
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
|
import docspell.store.Store
|
||||||
import docspell.store.records.RClassifierModel
|
import docspell.store.records.RClassifierModel
|
||||||
|
|
||||||
|
import bitpeace.RangeDef
|
||||||
|
|
||||||
object Classify {
|
object Classify {
|
||||||
|
|
||||||
def apply[F[_]: Sync: ContextShift](
|
def apply[F[_]: Sync: ContextShift](
|
||||||
|
@ -89,7 +89,7 @@ object LearnClassifierTask {
|
|||||||
): OptionT[F, OCollective.Classifier] =
|
): OptionT[F, OCollective.Classifier] =
|
||||||
if (cfg.classification.enabled)
|
if (cfg.classification.enabled)
|
||||||
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
|
OptionT(ctx.store.transact(RClassifierSetting.findById(ctx.args.collective)))
|
||||||
.filter(_.enabled)
|
.filter(_.autoTagEnabled)
|
||||||
.map(OCollective.Classifier.fromRecord)
|
.map(OCollective.Classifier.fromRecord)
|
||||||
else
|
else
|
||||||
OptionT.none
|
OptionT.none
|
||||||
|
@ -84,6 +84,7 @@ object AttachmentPageCount {
|
|||||||
Nil,
|
Nil,
|
||||||
MetaProposalList.empty,
|
MetaProposalList.empty,
|
||||||
md.pageCount.some,
|
md.pageCount.some,
|
||||||
|
None,
|
||||||
None
|
None
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@ -107,7 +107,8 @@ object CreateItem {
|
|||||||
Vector.empty,
|
Vector.empty,
|
||||||
fm.map(a => a.id -> a.fileId).toMap,
|
fm.map(a => a.id -> a.fileId).toMap,
|
||||||
MetaProposalList.empty,
|
MetaProposalList.empty,
|
||||||
Nil
|
Nil,
|
||||||
|
None
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,7 +167,8 @@ object CreateItem {
|
|||||||
Vector.empty,
|
Vector.empty,
|
||||||
origMap,
|
origMap,
|
||||||
MetaProposalList.empty,
|
MetaProposalList.empty,
|
||||||
Nil
|
Nil,
|
||||||
|
None
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,9 @@ import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
|
|||||||
* containing the source or origin file
|
* containing the source or origin file
|
||||||
* @param givenMeta meta data to this item that was not "guessed"
|
* @param givenMeta meta data to this item that was not "guessed"
|
||||||
* from an attachment but given and thus is always correct
|
* from an attachment but given and thus is always correct
|
||||||
|
* @param classifyProposals these are proposals that were obtained by
|
||||||
|
* a trained classifier. There are no NER tags; it will only provide a
|
||||||
|
* single label
|
||||||
*/
|
*/
|
||||||
case class ItemData(
|
case class ItemData(
|
||||||
item: RItem,
|
item: RItem,
|
||||||
@ -23,7 +26,10 @@ case class ItemData(
|
|||||||
dateLabels: Vector[AttachmentDates],
|
dateLabels: Vector[AttachmentDates],
|
||||||
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||||
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
givenMeta: MetaProposalList, // given meta data not associated to a specific attachment
|
||||||
tags: List[String] // a list of tags (names or ids) attached to the item if they exist
|
// a list of tags (names or ids) attached to the item if they exist
|
||||||
|
tags: List[String],
|
||||||
|
// proposals obtained from the classifier
|
||||||
|
classifyProposals: Option[MetaProposalList]
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||||
|
@ -65,7 +65,8 @@ object ReProcessItem {
|
|||||||
Vector.empty,
|
Vector.empty,
|
||||||
asrcMap.view.mapValues(_.fileId).toMap,
|
asrcMap.view.mapValues(_.fileId).toMap,
|
||||||
MetaProposalList.empty,
|
MetaProposalList.empty,
|
||||||
Nil
|
Nil,
|
||||||
|
None
|
||||||
)).getOrElseF(
|
)).getOrElseF(
|
||||||
Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
|
Sync[F].raiseError(new Exception(s"Item not found: ${ctx.args.itemId.id}"))
|
||||||
)
|
)
|
||||||
|
@ -17,7 +17,10 @@ object SaveProposals {
|
|||||||
data.metas
|
data.metas
|
||||||
.traverse(rm =>
|
.traverse(rm =>
|
||||||
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
|
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
|
||||||
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
|
ctx.store.transact(
|
||||||
|
RAttachmentMeta
|
||||||
|
.updateProposals(rm.id, rm.proposals, data.classifyProposals)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
.map(_ => data)
|
.map(_ => data)
|
||||||
}
|
}
|
||||||
|
@ -1,9 +1,12 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
|
import cats.Traverse
|
||||||
import cats.effect._
|
import cats.effect._
|
||||||
import cats.implicits._
|
import cats.implicits._
|
||||||
|
|
||||||
import docspell.analysis.classifier.TextClassifier
|
import docspell.analysis.classifier.TextClassifier
|
||||||
import docspell.analysis.{NlpSettings, TextAnalyser}
|
import docspell.analysis.{NlpSettings, TextAnalyser}
|
||||||
|
import docspell.common.MetaProposal.Candidate
|
||||||
import docspell.common._
|
import docspell.common._
|
||||||
import docspell.joex.Config
|
import docspell.joex.Config
|
||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
@ -37,12 +40,22 @@ object TextAnalysis {
|
|||||||
e <- s
|
e <- s
|
||||||
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
_ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
|
||||||
v = t.toVector
|
v = t.toVector
|
||||||
classifierEnabled <- getActive(ctx, cfg)
|
autoTagEnabled <- getActiveAutoTag(ctx, cfg)
|
||||||
tag <-
|
tag <-
|
||||||
if (classifierEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
|
if (autoTagEnabled) predictTags(ctx, cfg, item.metas, analyser.classifier)
|
||||||
else List.empty[String].pure[F]
|
else List.empty[String].pure[F]
|
||||||
|
|
||||||
|
classProposals <-
|
||||||
|
if (cfg.classification.enabled)
|
||||||
|
predictItemEntities(ctx, cfg, item.metas, analyser.classifier)
|
||||||
|
else MetaProposalList.empty.pure[F]
|
||||||
|
|
||||||
} yield item
|
} yield item
|
||||||
.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
|
.copy(
|
||||||
|
metas = v.map(_._1),
|
||||||
|
dateLabels = v.map(_._2),
|
||||||
|
classifyProposals = classProposals.some
|
||||||
|
)
|
||||||
.appendTags(tag)
|
.appendTags(tag)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,15 +85,8 @@ object TextAnalysis {
|
|||||||
): F[List[String]] = {
|
): F[List[String]] = {
|
||||||
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||||
val classifyWith: ClassifierName => F[Option[String]] =
|
val classifyWith: ClassifierName => F[Option[String]] =
|
||||||
Classify[F](
|
makeClassify(ctx, cfg, classifier)(text)
|
||||||
ctx.blocker,
|
|
||||||
ctx.logger,
|
|
||||||
cfg.workingDir,
|
|
||||||
ctx.store,
|
|
||||||
classifier,
|
|
||||||
ctx.args.meta.collective,
|
|
||||||
text
|
|
||||||
)
|
|
||||||
for {
|
for {
|
||||||
names <- ctx.store.transact(
|
names <- ctx.store.transact(
|
||||||
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
|
ClassifierName.findTagClassifiers(ctx.args.meta.collective)
|
||||||
@ -90,14 +96,61 @@ object TextAnalysis {
|
|||||||
} yield tags.flatten
|
} yield tags.flatten
|
||||||
}
|
}
|
||||||
|
|
||||||
private def getActive[F[_]: Sync](
|
def predictItemEntities[F[_]: Sync: ContextShift](
|
||||||
|
ctx: Context[F, Args],
|
||||||
|
cfg: Config.TextAnalysis,
|
||||||
|
metas: Vector[RAttachmentMeta],
|
||||||
|
classifier: TextClassifier[F]
|
||||||
|
): F[MetaProposalList] = {
|
||||||
|
val text = metas.flatMap(_.content).mkString(LearnClassifierTask.pageSep)
|
||||||
|
|
||||||
|
def classifyWith(
|
||||||
|
cname: ClassifierName,
|
||||||
|
mtype: MetaProposalType
|
||||||
|
): F[Option[MetaProposal]] =
|
||||||
|
for {
|
||||||
|
_ <- ctx.logger.debug(s"Guessing $mtype using classifier")
|
||||||
|
label <- makeClassify(ctx, cfg, classifier)(text).apply(cname)
|
||||||
|
} yield label.map(str =>
|
||||||
|
MetaProposal(mtype, Candidate(IdRef(Ident.unsafe(""), str), Set.empty))
|
||||||
|
)
|
||||||
|
|
||||||
|
Traverse[List]
|
||||||
|
.sequence(
|
||||||
|
List(
|
||||||
|
classifyWith(ClassifierName.correspondentOrg, MetaProposalType.CorrOrg),
|
||||||
|
classifyWith(ClassifierName.correspondentPerson, MetaProposalType.CorrPerson),
|
||||||
|
classifyWith(ClassifierName.concernedPerson, MetaProposalType.ConcPerson),
|
||||||
|
classifyWith(ClassifierName.concernedEquip, MetaProposalType.ConcEquip)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.map(_.flatten)
|
||||||
|
.map(MetaProposalList.apply)
|
||||||
|
}
|
||||||
|
|
||||||
|
private def makeClassify[F[_]: Sync: ContextShift](
|
||||||
|
ctx: Context[F, Args],
|
||||||
|
cfg: Config.TextAnalysis,
|
||||||
|
classifier: TextClassifier[F]
|
||||||
|
)(text: String): ClassifierName => F[Option[String]] =
|
||||||
|
Classify[F](
|
||||||
|
ctx.blocker,
|
||||||
|
ctx.logger,
|
||||||
|
cfg.workingDir,
|
||||||
|
ctx.store,
|
||||||
|
classifier,
|
||||||
|
ctx.args.meta.collective,
|
||||||
|
text
|
||||||
|
)
|
||||||
|
|
||||||
|
private def getActiveAutoTag[F[_]: Sync](
|
||||||
ctx: Context[F, Args],
|
ctx: Context[F, Args],
|
||||||
cfg: Config.TextAnalysis
|
cfg: Config.TextAnalysis
|
||||||
): F[Boolean] =
|
): F[Boolean] =
|
||||||
if (cfg.classification.enabled)
|
if (cfg.classification.enabled)
|
||||||
ctx.store
|
ctx.store
|
||||||
.transact(RClassifierSetting.findById(ctx.args.meta.collective))
|
.transact(RClassifierSetting.findById(ctx.args.meta.collective))
|
||||||
.map(_.exists(_.enabled))
|
.map(_.exists(_.autoTagEnabled))
|
||||||
.flatTap(enabled =>
|
.flatTap(enabled =>
|
||||||
if (enabled) ().pure[F]
|
if (enabled) ().pure[F]
|
||||||
else ctx.logger.info("Classification is disabled. Check config or settings.")
|
else ctx.logger.info("Classification is disabled. Check config or settings.")
|
||||||
|
@ -0,0 +1,3 @@
|
|||||||
|
ALTER TABLE "attachmentmeta"
|
||||||
|
ADD COLUMN "classify_proposals" text;
|
||||||
|
|
@ -0,0 +1,3 @@
|
|||||||
|
ALTER TABLE `attachmentmeta`
|
||||||
|
ADD COLUMN (`classify_proposals` mediumtext);
|
||||||
|
|
@ -0,0 +1,3 @@
|
|||||||
|
ALTER TABLE "attachmentmeta"
|
||||||
|
ADD COLUMN "classify_proposals" text;
|
||||||
|
|
@ -16,7 +16,8 @@ case class RAttachmentMeta(
|
|||||||
nerlabels: List[NerLabel],
|
nerlabels: List[NerLabel],
|
||||||
proposals: MetaProposalList,
|
proposals: MetaProposalList,
|
||||||
pages: Option[Int],
|
pages: Option[Int],
|
||||||
language: Option[Language]
|
language: Option[Language],
|
||||||
|
classifyProposals: Option[MetaProposalList]
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
|
||||||
@ -29,7 +30,7 @@ case class RAttachmentMeta(
|
|||||||
|
|
||||||
object RAttachmentMeta {
|
object RAttachmentMeta {
|
||||||
def empty(attachId: Ident, lang: Language) =
|
def empty(attachId: Ident, lang: Language) =
|
||||||
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang))
|
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang), None)
|
||||||
|
|
||||||
final case class Table(alias: Option[String]) extends TableDef {
|
final case class Table(alias: Option[String]) extends TableDef {
|
||||||
val tableName = "attachmentmeta"
|
val tableName = "attachmentmeta"
|
||||||
@ -40,8 +41,17 @@ object RAttachmentMeta {
|
|||||||
val proposals = Column[MetaProposalList]("itemproposals", this)
|
val proposals = Column[MetaProposalList]("itemproposals", this)
|
||||||
val pages = Column[Int]("page_count", this)
|
val pages = Column[Int]("page_count", this)
|
||||||
val language = Column[Language]("language", this)
|
val language = Column[Language]("language", this)
|
||||||
|
val classifyProposals = Column[MetaProposalList]("classify_proposals", this)
|
||||||
val all =
|
val all =
|
||||||
NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language)
|
NonEmptyList.of[Column[_]](
|
||||||
|
id,
|
||||||
|
content,
|
||||||
|
nerlabels,
|
||||||
|
proposals,
|
||||||
|
pages,
|
||||||
|
language,
|
||||||
|
classifyProposals
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
val T = Table(None)
|
val T = Table(None)
|
||||||
@ -52,7 +62,7 @@ object RAttachmentMeta {
|
|||||||
DML.insert(
|
DML.insert(
|
||||||
T,
|
T,
|
||||||
T.all,
|
T.all,
|
||||||
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}"
|
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language},${v.classifyProposals}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
def exists(attachId: Ident): ConnectionIO[Boolean] =
|
||||||
@ -80,7 +90,8 @@ object RAttachmentMeta {
|
|||||||
DML.set(
|
DML.set(
|
||||||
T.content.setTo(v.content),
|
T.content.setTo(v.content),
|
||||||
T.nerlabels.setTo(v.nerlabels),
|
T.nerlabels.setTo(v.nerlabels),
|
||||||
T.proposals.setTo(v.proposals)
|
T.proposals.setTo(v.proposals),
|
||||||
|
T.classifyProposals.setTo(v.classifyProposals)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -93,12 +104,17 @@ object RAttachmentMeta {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def updateProposals(mid: Ident, plist: MetaProposalList): ConnectionIO[Int] =
|
def updateProposals(
|
||||||
|
mid: Ident,
|
||||||
|
plist: MetaProposalList,
|
||||||
|
clist: Option[MetaProposalList]
|
||||||
|
): ConnectionIO[Int] =
|
||||||
DML.update(
|
DML.update(
|
||||||
T,
|
T,
|
||||||
T.id === mid,
|
T.id === mid,
|
||||||
DML.set(
|
DML.set(
|
||||||
T.proposals.setTo(plist)
|
T.proposals.setTo(plist),
|
||||||
|
T.classifyProposals.setTo(clist)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ case class RClassifierSetting(
|
|||||||
listType: ListType
|
listType: ListType
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def enabled: Boolean =
|
def autoTagEnabled: Boolean =
|
||||||
listType match {
|
listType match {
|
||||||
case ListType.Blacklist =>
|
case ListType.Blacklist =>
|
||||||
true
|
true
|
||||||
|
Loading…
x
Reference in New Issue
Block a user