mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Sort by weights better and save them
This commit is contained in:
parent
5d6658770e
commit
3e10e2175a
@ -7,7 +7,19 @@ import docspell.common.MetaProposal.Candidate
|
|||||||
import io.circe._
|
import io.circe._
|
||||||
import io.circe.generic.semiauto._
|
import io.circe.generic.semiauto._
|
||||||
import java.time.LocalDate
|
import java.time.LocalDate
|
||||||
|
import cats.kernel.Order
|
||||||
|
|
||||||
|
/** A proposed meta data to an item.
|
||||||
|
*
|
||||||
|
* There is only one value for each proposal type. The list of
|
||||||
|
* candidates is meant to be ordered from the best match to the
|
||||||
|
* lowest match.
|
||||||
|
*
|
||||||
|
* The candidate is already "resolved" against the database and
|
||||||
|
* contains a valid record (with its ID and a human readable name).
|
||||||
|
* Additionally it carries a set of "labels" (which may be empty)
|
||||||
|
* that are the source of this candidate.
|
||||||
|
*/
|
||||||
case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Candidate]) {
|
case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Candidate]) {
|
||||||
|
|
||||||
def addIdRef(refs: Seq[Candidate]): MetaProposal =
|
def addIdRef(refs: Seq[Candidate]): MetaProposal =
|
||||||
@ -21,6 +33,12 @@ case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Can
|
|||||||
|
|
||||||
def size: Int =
|
def size: Int =
|
||||||
values.size
|
values.size
|
||||||
|
|
||||||
|
/** Assigns a weight to every candidate of this proposal.
  *
  * @param wf computes the weight for a single candidate
  * @return a proposal of the same type whose candidates all carry the
  *         weight produced by `wf`
  */
def addWeights(wf: Candidate => Double): MetaProposal =
  copy(values = values.map(cand => cand.withWeight(wf(cand))))
|
||||||
|
|
||||||
|
/** Sorts the candidates by ascending weight using
  * `Candidate.weightOrder`, which places candidates without a weight
  * last.
  */
def sortByWeight: MetaProposal =
  copy(values = values.sortBy(cand => cand.weight)(Candidate.weightOrder))
|
||||||
}
|
}
|
||||||
|
|
||||||
object MetaProposal {
|
object MetaProposal {
|
||||||
@ -31,23 +49,51 @@ object MetaProposal {
|
|||||||
/** Tries to read the text of the given identifier as a date via
  * `LocalDate.parse`.
  *
  * @return the parsed date, or `None` if parsing throws a non-fatal
  *         error
  */
def parseDate(date: Ident): Option[LocalDate] =
  scala.util.Try(LocalDate.parse(date.id)).toOption
|
||||||
|
|
||||||
/** A single candidate of a proposal.
  *
  * @param ref    the referenced record (its id and name)
  * @param origin the labels this candidate was derived from; may be empty
  * @param weight optional weight used for sorting candidates; `None`
  *               means no weight has been assigned
  */
case class Candidate(ref: IdRef, origin: Set[NerLabel], weight: Option[Double] = None) {

  /** Returns a candidate with the same `ref` and `origin` that carries
    * the weight `w`.
    */
  def withWeight(w: Double): Candidate =
    Candidate(ref, origin, Some(w))
}
|
||||||
|
|
||||||
object Candidate {
  // JSON codecs derived from the fields of the case class.
  implicit val jsonEncoder: Encoder[Candidate] =
    deriveEncoder[Candidate]
  implicit val jsonDecoder: Decoder[Candidate] =
    deriveDecoder[Candidate]

  /** Order for optional weights. Unlike the standard order for
    * `Option`, a missing weight (`None`) sorts after every present
    * weight; present weights are compared with `Order[Double]`.
    */
  val weightOrder: Order[Option[Double]] =
    Order.from[Option[Double]] { (left, right) =>
      (left, right) match {
        case (Some(a), Some(b)) => Order[Double].compare(a, b)
        case (Some(_), None)    => -1
        case (None, Some(_))    => 1
        case (None, None)       => 0
      }
    }
}
|
||||||
|
|
||||||
|
/** Merges candidates with the same `IdRef` id and concatenates their
  * respective labels. The order of first occurrence is preserved.
  *
  * When several candidates share an id, the first one is kept (with
  * its name and weight) and only the labels of the later ones are
  * added to it.
  *
  * @param s the candidates to merge
  * @return a list in which every id occurs exactly once
  */
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
  def mergeInto(
      res: NonEmptyList[Candidate],
      el: Candidate
  ): NonEmptyList[Candidate] =
    // Decide by id whether `el` is already present. Comparing the
    // mapped list against `res` is not reliable: a duplicate that adds
    // no new labels leaves the list unchanged and would be appended
    // a second time.
    if (res.exists(_.ref.id == el.ref.id))
      res.map(c =>
        if (c.ref.id == el.ref.id) c.copy(origin = c.origin ++ el.origin) else c
      )
    else res :+ el

  s.tail.foldLeft(NonEmptyList.of(s.head))(mergeInto)
}
|
||||||
|
|
||||||
// JSON codecs for MetaProposal, derived from the fields of the case class.
implicit val jsonDecoder: Decoder[MetaProposal] =
|
implicit val jsonDecoder: Decoder[MetaProposal] =
|
||||||
deriveDecoder[MetaProposal]
|
deriveDecoder[MetaProposal]
|
||||||
implicit val jsonEncoder: Encoder[MetaProposal] =
|
implicit val jsonEncoder: Encoder[MetaProposal] =
|
||||||
deriveEncoder[MetaProposal]
|
deriveEncoder[MetaProposal]
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,10 @@ import docspell.common.MetaProposal.Candidate
|
|||||||
import io.circe._
|
import io.circe._
|
||||||
import io.circe.generic.semiauto._
|
import io.circe.generic.semiauto._
|
||||||
|
|
||||||
|
/** A list of proposals for meta data to an item.
|
||||||
|
*
|
||||||
|
* The list usually keeps only one value for each `MetaProposalType'.
|
||||||
|
*/
|
||||||
case class MetaProposalList private (proposals: List[MetaProposal]) {
|
case class MetaProposalList private (proposals: List[MetaProposal]) {
|
||||||
|
|
||||||
def isEmpty: Boolean = proposals.isEmpty
|
def isEmpty: Boolean = proposals.isEmpty
|
||||||
@ -31,6 +35,11 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
|
|||||||
def find(mpt: MetaProposalType): Option[MetaProposal] =
|
def find(mpt: MetaProposalType): Option[MetaProposal] =
|
||||||
proposals.find(_.proposalType == mpt)
|
proposals.find(_.proposalType == mpt)
|
||||||
|
|
||||||
|
/** Applies `f` to every proposal and returns the resulting list; the
  * number and order of proposals is unchanged.
  */
def change(f: MetaProposal => MetaProposal): MetaProposalList =
  copy(proposals = proposals.map(f))
|
||||||
|
|
||||||
|
/** Sorts the candidates inside every proposal by their weight (see
  * `MetaProposal.sortByWeight`). The order of the proposals themselves
  * is not changed.
  */
def sortByWeights: MetaProposalList =
  change(proposal => proposal.sortByWeight)
|
||||||
}
|
}
|
||||||
|
|
||||||
object MetaProposalList {
|
object MetaProposalList {
|
||||||
@ -54,6 +63,12 @@ object MetaProposalList {
|
|||||||
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList =
|
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList =
|
||||||
new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) }))
|
new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) }))
|
||||||
|
|
||||||
|
/** Flattens the given list of meta-proposals into a single list,
|
||||||
|
* where each meta-proposal type exists at most once. Candidates to
|
||||||
|
* equal proposal-types are merged together. The candidate's order
|
||||||
|
* is preserved and candidates of proposals are appended as given
|
||||||
|
* by the order of the given sequence `ml'.
|
||||||
|
*/
|
||||||
def flatten(ml: Seq[MetaProposalList]): MetaProposalList = {
|
def flatten(ml: Seq[MetaProposalList]): MetaProposalList = {
|
||||||
val init: Map[MetaProposalType, MetaProposal] = Map.empty
|
val init: Map[MetaProposalType, MetaProposal] = Map.empty
|
||||||
|
|
||||||
|
@ -0,0 +1,67 @@
|
|||||||
|
package docspell.common
|
||||||
|
|
||||||
|
import minitest._
|
||||||
|
import cats.data.NonEmptyList
|
||||||
|
import docspell.common.MetaProposal.Candidate
|
||||||
|
|
||||||
|
object MetaProposalListTest extends SimpleTestSuite {

  /** Builds a candidate with the given id, the fixed name "name", no
    * labels and the given (optional) weight.
    */
  private def candidate(id: String, weight: Option[Double] = None): Candidate =
    Candidate(IdRef(Ident.unsafe(id), "name"), Set.empty, weight)

  /** Wraps a single candidate into a `CorrOrg` proposal. */
  private def corrOrg(cand: Candidate): MetaProposal =
    MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand))

  /** Returns the candidates of the `CorrOrg` proposal, which every test
    * here expects to be present.
    */
  private def corrOrgCandidates(mpl: MetaProposalList): NonEmptyList[Candidate] =
    mpl.find(MetaProposalType.CorrOrg).get.values

  test("flatten retains order of candidates") {
    val first      = candidate("123")
    val second     = candidate("456")
    val firstList  = MetaProposalList.of(corrOrg(first))
    val secondList = MetaProposalList.of(corrOrg(second))

    val forward =
      corrOrgCandidates(MetaProposalList.flatten(Seq(firstList, secondList)))
    assertEquals(forward.head, first)
    assertEquals(forward.tail.head, second)

    val backward =
      corrOrgCandidates(MetaProposalList.flatten(Seq(secondList, firstList)))
    assertEquals(backward.head, second)
    assertEquals(backward.tail.head, first)
  }

  test("sort by weights") {
    val heavier = candidate("123", Some(0.1))
    val lighter = candidate("456", Some(0.05))

    // Both proposals have the same type; the assertions below expect
    // their candidates to end up in one `CorrOrg` proposal.
    val sorted = MetaProposalList.of(corrOrg(heavier), corrOrg(lighter)).sortByWeights

    val result = corrOrgCandidates(sorted)
    assertEquals(result.head, lighter)
    assertEquals(result.tail.head, heavier)
  }

  test("sort by weights: unset is last") {
    val weighted   = candidate("123", Some(0.1))
    val unweighted = candidate("456")

    val sorted = MetaProposalList.of(corrOrg(weighted), corrOrg(unweighted)).sortByWeights

    val result = corrOrgCandidates(sorted)
    assertEquals(result.head, weighted)
    assertEquals(result.tail.head, unweighted)
  }
}
|
@ -73,7 +73,8 @@ object CreateItem {
|
|||||||
fm,
|
fm,
|
||||||
Vector.empty,
|
Vector.empty,
|
||||||
Vector.empty,
|
Vector.empty,
|
||||||
fm.map(a => a.id -> a.fileId).toMap
|
fm.map(a => a.id -> a.fileId).toMap,
|
||||||
|
MetaProposalList.empty
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,7 +111,7 @@ object CreateItem {
|
|||||||
.map(originFileTuple)
|
.map(originFileTuple)
|
||||||
.toMap
|
.toMap
|
||||||
} yield cand.headOption.map(ri =>
|
} yield cand.headOption.map(ri =>
|
||||||
ItemData(ri, rms, Vector.empty, Vector.empty, origMap)
|
ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,7 +7,8 @@ import docspell.common._
|
|||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
import docspell.store.records.RAttachmentMeta
|
import docspell.store.records.RAttachmentMeta
|
||||||
|
|
||||||
/** Reorders the proposals to put most probable fits first.
|
/** Calculates a weight for each candidate, giving the most likely
|
||||||
|
* candidate the lowest number.
|
||||||
*/
|
*/
|
||||||
object EvalProposals {
|
object EvalProposals {
|
||||||
|
|
||||||
@ -16,24 +17,14 @@ object EvalProposals {
|
|||||||
Timestamp
|
Timestamp
|
||||||
.current[F]
|
.current[F]
|
||||||
.map { now =>
|
.map { now =>
|
||||||
val metas = data.metas.map(reorderCandidates(now.toUtcDate))
|
val metas = data.metas.map(calcCandidateWeight(now.toUtcDate))
|
||||||
data.copy(metas = metas)
|
data.copy(metas = metas)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Adds a weight to every candidate in the proposals of the given
  * attachment meta data and sorts the candidates of each proposal by
  * that weight.
  *
  * @param now the reference date handed to `weight`
  * @param rm  the attachment meta data whose proposals are weighted
  * @return `rm` with its proposals replaced by the weighted and sorted
  *         ones
  */
def calcCandidateWeight(now: LocalDate)(rm: RAttachmentMeta): RAttachmentMeta =
  rm.copy(
    proposals = rm.proposals
      .change(mp => mp.addWeights(weight(rm, mp, now)))
      .sortByWeights
  )
|
||||||
|
|
||||||
def weight(rm: RAttachmentMeta, mp: MetaProposal, ref: LocalDate)(
|
def weight(rm: RAttachmentMeta, mp: MetaProposal, ref: LocalDate)(
|
||||||
|
@ -1,15 +1,18 @@
|
|||||||
package docspell.joex.process
|
package docspell.joex.process
|
||||||
|
|
||||||
import docspell.common.{Ident, NerDateLabel, NerLabel}
|
import docspell.common._
|
||||||
import docspell.joex.process.ItemData.AttachmentDates
|
import docspell.joex.process.ItemData.AttachmentDates
|
||||||
import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
|
import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
|
||||||
|
|
||||||
|
/** Data that is carried across all processing tasks.
|
||||||
|
*/
|
||||||
case class ItemData(
|
case class ItemData(
|
||||||
item: RItem,
|
item: RItem,
|
||||||
attachments: Vector[RAttachment],
|
attachments: Vector[RAttachment],
|
||||||
metas: Vector[RAttachmentMeta],
|
metas: Vector[RAttachmentMeta],
|
||||||
dateLabels: Vector[AttachmentDates],
|
dateLabels: Vector[AttachmentDates],
|
||||||
originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id
|
originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
|
||||||
|
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
|
||||||
) {
|
) {
|
||||||
|
|
||||||
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
def findMeta(attachId: Ident): Option[RAttachmentMeta] =
|
||||||
|
@ -10,7 +10,9 @@ object LinkProposal {
|
|||||||
|
|
||||||
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
val proposals = MetaProposalList.flatten(data.metas.map(_.proposals))
|
// sort by weight; the order of equally weighted candidates is not
|
||||||
|
// important: just choose one, the others then become suggestions
|
||||||
|
val proposals = MetaProposalList.flatten(data.metas.map(_.proposals)).sortByWeights
|
||||||
|
|
||||||
ctx.logger.info(s"Starting linking proposals") *>
|
ctx.logger.info(s"Starting linking proposals") *>
|
||||||
MetaProposalType.all
|
MetaProposalType.all
|
||||||
@ -24,7 +26,7 @@ object LinkProposal {
|
|||||||
proposalList: MetaProposalList,
|
proposalList: MetaProposalList,
|
||||||
ctx: Context[F, ProcessItemArgs]
|
ctx: Context[F, ProcessItemArgs]
|
||||||
)(mpt: MetaProposalType): F[Result] =
|
)(mpt: MetaProposalType): F[Result] =
|
||||||
proposalList.find(mpt) match {
|
data.givenMeta.find(mpt).orElse(proposalList.find(mpt)) match {
|
||||||
case None =>
|
case None =>
|
||||||
Result.noneFound(mpt).pure[F]
|
Result.noneFound(mpt).pure[F]
|
||||||
case Some(a) if a.isSingleValue =>
|
case Some(a) if a.isSingleValue =>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user