From 3e10e2175a1f651711669ea05a7ca73f638ed0a7 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Tue, 12 May 2020 01:13:22 +0200 Subject: [PATCH] Sort by weights better and save them --- .../scala/docspell/common/MetaProposal.scala | 56 ++++++++++++++-- .../docspell/common/MetaProposalList.scala | 15 +++++ .../common/MetaProposalListTest.scala | 67 +++++++++++++++++++ .../docspell/joex/process/CreateItem.scala | 5 +- .../docspell/joex/process/EvalProposals.scala | 21 ++---- .../docspell/joex/process/ItemData.scala | 7 +- .../docspell/joex/process/LinkProposal.scala | 6 +- 7 files changed, 151 insertions(+), 26 deletions(-) create mode 100644 modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala index f7c34955..d3d40012 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala @@ -7,7 +7,19 @@ import docspell.common.MetaProposal.Candidate import io.circe._ import io.circe.generic.semiauto._ import java.time.LocalDate +import cats.kernel.Order +/** A proposed meta data to an item. + * + * There is only one value for each proposal type. The list of + * candidates is meant to be ordered from the best match to the + * lowest match. + * + * The candidate is already "resolved" against the database and + * contains a valid record (with its ID and a human readable name). + * Additionally it carries a set of "labels" (which may be empty) + * that are the source of this candidate. 
+ */ case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Candidate]) { def addIdRef(refs: Seq[Candidate]): MetaProposal = @@ -21,6 +33,12 @@ case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Can def size: Int = values.size + + def addWeights(wf: Candidate => Double): MetaProposal = + MetaProposal(proposalType, values.map(c => c.withWeight(wf(c)))) + + def sortByWeight: MetaProposal = + MetaProposal(proposalType, values.sortBy(_.weight)(Candidate.weightOrder)) } object MetaProposal { @@ -31,23 +49,51 @@ object MetaProposal { def parseDate(date: Ident): Option[LocalDate] = Either.catchNonFatal(LocalDate.parse(date.id)).toOption - case class Candidate(ref: IdRef, origin: Set[NerLabel]) + case class Candidate(ref: IdRef, origin: Set[NerLabel], weight: Option[Double] = None) { + def withWeight(w: Double): Candidate = + copy(weight = Some(w)) + } + object Candidate { implicit val jsonEncoder: Encoder[Candidate] = deriveEncoder[Candidate] implicit val jsonDecoder: Decoder[Candidate] = deriveDecoder[Candidate] + + /** This deviates from standard order to sort None at last. + */ + val weightOrder: Order[Option[Double]] = new Order[Option[Double]] { + def compare(x: Option[Double], y: Option[Double]) = + (x, y) match { + case (None, None) => 0 + case (None, _) => 1 + case (_, None) => -1 + case (Some(x), Some(y)) => Order[Double].compare(x, y) + } + } } + /** Merges candidates with same `IdRef' values and concatenates their + * respective labels. The candidate order is preserved. 
+ */ def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = { - def append(list: List[Candidate]): Candidate = - list.reduce((l0, l1) => l0.copy(origin = l0.origin ++ l1.origin)) - val grouped = s.toList.groupBy(_.ref.id) - NonEmptyList.fromListUnsafe(grouped.values.toList.map(append)) + def mergeInto( + res: NonEmptyList[Candidate], + el: Candidate + ): NonEmptyList[Candidate] = { + val l = res.map(c => + if (c.ref.id == el.ref.id) c.copy(origin = c.origin ++ el.origin) else c + ) + if (res.exists(_.ref.id == el.ref.id)) l // id already known: labels were merged above, even if none were new + else l :+ el + } + val init = NonEmptyList.of(s.head) + s.tail.foldLeft(init)(mergeInto) } implicit val jsonDecoder: Decoder[MetaProposal] = deriveDecoder[MetaProposal] implicit val jsonEncoder: Encoder[MetaProposal] = deriveEncoder[MetaProposal] + } diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index 79473e1b..d72f5f85 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -6,6 +6,10 @@ import docspell.common.MetaProposal.Candidate import io.circe._ import io.circe.generic.semiauto._ +/** A list of proposals for meta data to an item. + * + * The list usually keeps only one value for each `MetaProposalType'. 
+ */ case class MetaProposalList private (proposals: List[MetaProposal]) { def isEmpty: Boolean = proposals.isEmpty @@ -31,6 +35,11 @@ case class MetaProposalList private (proposals: List[MetaProposal]) { def find(mpt: MetaProposalType): Option[MetaProposal] = proposals.find(_.proposalType == mpt) + def change(f: MetaProposal => MetaProposal): MetaProposalList = + new MetaProposalList(proposals.map(f)) + + def sortByWeights: MetaProposalList = + change(_.sortByWeight) } object MetaProposalList { @@ -54,6 +63,12 @@ object MetaProposalList { def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) })) + /** Flattens the given list of meta-proposals into a single list, + * where each meta-proposal type exists at most once. Candidates of + * equal proposal types are merged together. The candidates' order + * is preserved and candidates of proposals are appended as given + * by the order of the given `ml'. 
+ */ def flatten(ml: Seq[MetaProposalList]): MetaProposalList = { val init: Map[MetaProposalType, MetaProposal] = Map.empty diff --git a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala new file mode 100644 index 00000000..a8ffde51 --- /dev/null +++ b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala @@ -0,0 +1,67 @@ +package docspell.common + +import minitest._ +import cats.data.NonEmptyList +import docspell.common.MetaProposal.Candidate + +object MetaProposalListTest extends SimpleTestSuite { + + test("flatten retains order of candidates") { + val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty) + val mpl1 = MetaProposalList.of( + MetaProposal( + MetaProposalType.CorrOrg, + NonEmptyList.of(cand1) + ) + ) + val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty) + val mpl2 = MetaProposalList.of( + MetaProposal( + MetaProposalType.CorrOrg, + NonEmptyList.of(cand2) + ) + ) + + val candidates1 = MetaProposalList + .flatten(Seq(mpl1, mpl2)) + .find(MetaProposalType.CorrOrg) + .get + .values + assertEquals(candidates1.head, cand1) + assertEquals(candidates1.tail.head, cand2) + + val candidates2 = MetaProposalList + .flatten(Seq(mpl2, mpl1)) + .find(MetaProposalType.CorrOrg) + .get + .values + assertEquals(candidates2.head, cand2) + assertEquals(candidates2.tail.head, cand1) + } + + test("sort by weights") { + val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty, Some(0.1)) + val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty, Some(0.05)) + val mpl = MetaProposalList.of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) + ).sortByWeights + + val candidates = mpl.find(MetaProposalType.CorrOrg).get.values + assertEquals(candidates.head, cand2) + assertEquals(candidates.tail.head, cand1) + } + + test("sort by 
weights: unset is last") { + val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty, Some(0.1)) + val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty) + val mpl = MetaProposalList.of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) + ).sortByWeights + + val candidates = mpl.find(MetaProposalType.CorrOrg).get.values + assertEquals(candidates.head, cand1) + assertEquals(candidates.tail.head, cand2) + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 798fb6f5..595c0b1b 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -73,7 +73,8 @@ object CreateItem { fm, Vector.empty, Vector.empty, - fm.map(a => a.id -> a.fileId).toMap + fm.map(a => a.id -> a.fileId).toMap, + MetaProposalList.empty ) } @@ -110,7 +111,7 @@ object CreateItem { .map(originFileTuple) .toMap } yield cand.headOption.map(ri => - ItemData(ri, rms, Vector.empty, Vector.empty, origMap) + ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty) ) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala index 6e492a07..cf5ecb1c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/EvalProposals.scala @@ -7,7 +7,8 @@ import docspell.common._ import docspell.joex.scheduler.Task import docspell.store.records.RAttachmentMeta -/** Reorders the proposals to put most probable fits first. +/** Calculates a weight for each candidate, giving more likely + * candidates a lower number. 
*/ object EvalProposals { @@ -16,24 +17,14 @@ object EvalProposals { Timestamp .current[F] .map { now => - val metas = data.metas.map(reorderCandidates(now.toUtcDate)) + val metas = data.metas.map(calcCandidateWeight(now.toUtcDate)) data.copy(metas = metas) } } - def reorderCandidates(now: LocalDate)(rm: RAttachmentMeta): RAttachmentMeta = { - val list = rm.proposals.getTypes.toList - .map(mpt => - rm.proposals.find(mpt) match { - case Some(mp) => - val v = mp.values.sortBy(weight(rm, mp, now)) - Some(mp.copy(values = v)) - case None => - None - } - ) - - rm.copy(proposals = MetaProposalList(list.flatMap(identity))) + def calcCandidateWeight(now: LocalDate)(rm: RAttachmentMeta): RAttachmentMeta = { + val list = rm.proposals.change(mp => mp.addWeights(weight(rm, mp, now))) + rm.copy(proposals = list.sortByWeights) } def weight(rm: RAttachmentMeta, mp: MetaProposal, ref: LocalDate)( diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 48dada99..7d8e7729 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -1,15 +1,18 @@ package docspell.joex.process -import docspell.common.{Ident, NerDateLabel, NerLabel} +import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} +/** Data that is carried across all processing tasks. 
+ */ case class ItemData( item: RItem, attachments: Vector[RAttachment], metas: Vector[RAttachmentMeta], dateLabels: Vector[AttachmentDates], - originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id + originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id + givenMeta: MetaProposalList // given meta data not associated to a specific attachment ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala index 9416b386..ca875215 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala @@ -10,7 +10,9 @@ object LinkProposal { def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = Task { ctx => - val proposals = MetaProposalList.flatten(data.metas.map(_.proposals)) + // sort by weight; the order of equal weights is not important: just + // choose one, the others then become suggestions + val proposals = MetaProposalList.flatten(data.metas.map(_.proposals)).sortByWeights ctx.logger.info(s"Starting linking proposals") *> MetaProposalType.all @@ -24,7 +26,7 @@ object LinkProposal { proposalList: MetaProposalList, ctx: Context[F, ProcessItemArgs] )(mpt: MetaProposalType): F[Result] = - proposalList.find(mpt) match { + data.givenMeta.find(mpt).orElse(proposalList.find(mpt)) match { case None => Result.noneFound(mpt).pure[F] case Some(a) if a.isSingleValue =>