From eb308cfa85fc5daff31164cae1f33ac8561ea551 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 15 Feb 2021 21:05:34 +0100 Subject: [PATCH 1/2] Remove duplicate candidates when creating list of suggestions --- .../main/scala/docspell/common/IdRef.scala | 5 ++++ .../main/scala/docspell/common/Ident.scala | 4 +++ .../scala/docspell/common/MetaProposal.scala | 5 +++- .../docspell/common/MetaProposalList.scala | 3 +- .../common/MetaProposalListTest.scala | 30 +++++++++++++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/IdRef.scala b/modules/common/src/main/scala/docspell/common/IdRef.scala index 8c32405c..918030e7 100644 --- a/modules/common/src/main/scala/docspell/common/IdRef.scala +++ b/modules/common/src/main/scala/docspell/common/IdRef.scala @@ -1,5 +1,7 @@ package docspell.common +import cats.Order + import io.circe._ import io.circe.generic.semiauto._ @@ -11,4 +13,7 @@ object IdRef { deriveEncoder[IdRef] implicit val jsonDecoder: Decoder[IdRef] = deriveDecoder[IdRef] + + implicit val order: Order[IdRef] = + Order.by(_.id) } diff --git a/modules/common/src/main/scala/docspell/common/Ident.scala b/modules/common/src/main/scala/docspell/common/Ident.scala index dcb8cf2d..95e58bc6 100644 --- a/modules/common/src/main/scala/docspell/common/Ident.scala +++ b/modules/common/src/main/scala/docspell/common/Ident.scala @@ -6,6 +6,7 @@ import java.util.UUID import cats.Eq import cats.effect.Sync import cats.implicits._ +import cats.Order import io.circe.{Decoder, Encoder} import scodec.bits.ByteVector @@ -66,4 +67,7 @@ object Ident { implicit val decodeIdent: Decoder[Ident] = Decoder.decodeString.emap(Ident.fromString) + implicit val order: Order[Ident] = + Order.by(_.id) + } diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala index 62a9355f..36f4be1c 100644 --- 
a/modules/common/src/main/scala/docspell/common/MetaProposal.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala @@ -2,9 +2,9 @@ package docspell.common import java.time.LocalDate +import cats.Order import cats.data.NonEmptyList import cats.implicits._ -import cats.kernel.Order import docspell.common.MetaProposal.Candidate import docspell.common._ @@ -74,6 +74,9 @@ object MetaProposal { implicit val jsonDecoder: Decoder[Candidate] = deriveDecoder[Candidate] + implicit val order: Order[Candidate] = + Order.by(_.ref) + /** This deviates from standard order to sort None at last. */ val weightOrder: Order[Option[Double]] = new Order[Option[Double]] { diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index 04cedb30..33865484 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -52,7 +52,8 @@ case class MetaProposalList private (proposals: List[MetaProposal]) { (map, next) => map.get(next.proposalType) match { case Some(MetaProposal(mt, values)) => - val cand = NonEmptyList(values.head, next.values.toList ++ values.tail) + val cand = + NonEmptyList(values.head, next.values.toList ++ values.tail).distinct map.updated(next.proposalType, MetaProposal(mt, MetaProposal.flatten(cand))) case None => map.updated(next.proposalType, next) diff --git a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala index 44a6cfc2..c35bafcc 100644 --- a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala +++ b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala @@ -99,4 +99,34 @@ object MetaProposalListTest extends SimpleTestSuite { ) ) } + + test("insert second, remove duplicates") { + val cand1 = 
Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty) + val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty) + val cand3 = Candidate(IdRef(Ident.unsafe("789"), "name"), Set.empty) + val cand5 = Candidate(IdRef(Ident.unsafe("def"), "name"), Set.empty) + + val mpl1 = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1, cand2)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand3)) + ) + + val mpl2 = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand5)) + ) + + val result = mpl1.insertSecond(mpl2) + assertEquals( + result, + MetaProposalList( + List( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1, cand2)), + MetaProposal(MetaProposalType.ConcPerson, NonEmptyList.of(cand3, cand5)) + ) + ) + ) + } } From d99ce76d89533f346eda35974d0f02388281fe3c Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Mon, 15 Feb 2021 22:52:08 +0100 Subject: [PATCH 2/2] Remove person suggestion if it doesn't match with organization --- .../docspell/common/MetaProposalList.scala | 5 ++ .../joex/process/CrossCheckProposals.scala | 68 +++++++++++++++++++ .../docspell/joex/process/EvalProposals.scala | 4 +- .../docspell/joex/process/ItemData.scala | 11 +++ .../docspell/joex/process/LinkProposal.scala | 9 +-- .../docspell/joex/process/ProcessItem.scala | 1 + 6 files changed, 89 insertions(+), 9 deletions(-) create mode 100644 modules/joex/src/main/scala/docspell/joex/process/CrossCheckProposals.scala diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index 33865484..1f34648d 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -40,6 +40,11 @@ case class MetaProposalList private (proposals: List[MetaProposal]) 
{ def change(f: MetaProposal => MetaProposal): MetaProposalList = new MetaProposalList(proposals.map(f)) + def replace(mp: MetaProposal): MetaProposalList = { + val next = proposals.filter(_.proposalType != mp.proposalType) + MetaProposalList(mp :: next) + } + def filter(f: MetaProposal => Boolean): MetaProposalList = new MetaProposalList(proposals.filter(f)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/CrossCheckProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/CrossCheckProposals.scala new file mode 100644 index 00000000..06c85ec0 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/CrossCheckProposals.scala @@ -0,0 +1,68 @@ +package docspell.joex.process + +import cats.data.NonEmptyList +import cats.data.OptionT +import cats.effect.Sync +import cats.implicits._ + +import docspell.common._ +import docspell.joex.scheduler.Task + +/** After candidates have been determined, the set is reduced by doing + * some cross checks. For example: if an organization is suggested as + * correspondent, the correspondent person must be linked to that + * organization. So this *removes all* person candidates that are not + * linked to the first organization candidate (which will be linked + * to the item). 
+ */ +object CrossCheckProposals { + + def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = + Task { ctx => + val proposals = data.finalProposals + val corrOrg = proposals.find(MetaProposalType.CorrOrg) + (for { + orgRef <- OptionT.fromOption[F](corrOrg) + persRefs <- OptionT.liftF(EvalProposals.findOrganizationRelation(data, ctx)) + clProps <- OptionT.liftF( + personOrgCheck[F](ctx.logger, data.classifyProposals, persRefs)(orgRef) + ) + atProps <- OptionT.liftF { + data.metas.traverse(ra => + personOrgCheck[F](ctx.logger, ra.proposals, persRefs)(orgRef).map(nl => + ra.copy(proposals = nl) + ) + ) + } + } yield data.copy(classifyProposals = clProps, metas = atProps)).getOrElse(data) + } + + def personOrgCheck[F[_]: Sync]( + logger: Logger[F], + mpl: MetaProposalList, + persRefs: Map[Ident, PersonRef] + )( + corrOrg: MetaProposal + ): F[MetaProposalList] = { + val orgId = corrOrg.values.head.ref.id + mpl.find(MetaProposalType.CorrPerson) match { + case Some(ppl) => + val list = ppl.values.filter(c => + persRefs.get(c.ref.id).exists(_.organization == Some(orgId)) + ) + + if (ppl.values.toList == list) mpl.pure[F] + else + logger.debug( + "Removing person suggestion, because it doesn't match organization" + ) *> NonEmptyList + .fromList(list) + .map(nel => mpl.replace(MetaProposal(ppl.proposalType, nel))) + .getOrElse(mpl.filter(_.proposalType != ppl.proposalType)) + .pure[F] + + case None => + mpl.pure[F] + } + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala index 772e9c03..8846b27a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala @@ -28,7 +28,9 @@ object EvalProposals { ctx: Context[F, _] ): F[Map[Ident, PersonRef]] = { val corrPersIds = data.metas - .flatMap(_.proposals.find(MetaProposalType.CorrPerson)) + .map(_.proposals) 
+ .appended(data.classifyProposals) + .flatMap(_.find(MetaProposalType.CorrPerson)) .flatMap(_.values.toList.map(_.ref.id)) .toSet ctx.store diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index f7f52fe5..f19a4244 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -33,6 +33,17 @@ case class ItemData( classifyTags: List[String] ) { + /** sort by weight; order of equal weights is not important, just + * choose one; the others are then suggestions + * doc-date is only set when given explicitly, not from "guessing" + */ + def finalProposals: MetaProposalList = + MetaProposalList + .flatten(metas.map(_.proposals)) + .filter(_.proposalType != MetaProposalType.DocDate) + .sortByWeights + .fillEmptyFrom(classifyProposals) + def findMeta(attachId: Ident): Option[RAttachmentMeta] = metas.find(_.id == attachId) diff --git a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala index be8d34c8..6fa15978 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala @@ -17,14 +17,7 @@ object LinkProposal { .map(_ => data) else Task { ctx => - // sort by weight; order of equal weights is not important, just - // choose one others are then suggestions - // doc-date is only set when given explicitely, not from "guessing" - val proposals = MetaProposalList - .flatten(data.metas.map(_.proposals)) - .filter(_.proposalType != MetaProposalType.DocDate) - .sortByWeights - .fillEmptyFrom(data.classifyProposals) + val proposals = data.finalProposals ctx.logger.info(s"Starting linking proposals") *> MetaProposalType.all diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala 
b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index c119b467..1ba548de 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -43,6 +43,7 @@ object ProcessItem { TextAnalysis[F](cfg.textAnalysis, analyser, regexNer)(item) .flatMap(FindProposal[F](cfg.textAnalysis)) .flatMap(EvalProposals[F]) + .flatMap(CrossCheckProposals[F]) .flatMap(SaveProposals[F]) private def processAttachments0[F[_]: ConcurrentEffect: ContextShift](