Merge pull request #123 from eikek/use-given-metadata

Use given metadata
This commit is contained in:
eikek 2020-05-17 12:11:38 +02:00 committed by GitHub
commit 8a56cf0801
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 228 additions and 36 deletions

View File

@ -34,7 +34,7 @@ object DateFind {
private case class SimpleDate(year: Int, month: Int, day: Int) { private case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate = def toLocalDate: LocalDate =
LocalDate.of(if (year < 100) 1900 + year else year, month, day) LocalDate.of(if (year < 100) 2000 + year else year, month, day)
} }
private object SimpleDate { private object SimpleDate {

View File

@ -7,7 +7,19 @@ import docspell.common.MetaProposal.Candidate
import io.circe._ import io.circe._
import io.circe.generic.semiauto._ import io.circe.generic.semiauto._
import java.time.LocalDate import java.time.LocalDate
import cats.kernel.Order
/** A proposed meta data to an item.
*
* There is only one value for each proposal type. The list of
* candidates is meant to be ordered from the best match to the
* lowest match.
*
* The candidate is already "resolved" against the database and
* contains a valid record (with its ID and a human readable name).
* Additionally it carries a set of "labels" (which may be empty)
* that are the source of this candidate.
*/
case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Candidate]) { case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Candidate]) {
def addIdRef(refs: Seq[Candidate]): MetaProposal = def addIdRef(refs: Seq[Candidate]): MetaProposal =
@ -21,33 +33,78 @@ case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Can
def size: Int = def size: Int =
values.size values.size
/** Returns a copy of this proposal in which every candidate carries the
  * weight that `wf` computes for it.
  *
  * @param wf function assigning a weight to a candidate
  */
def addWeights(wf: Candidate => Double): MetaProposal = {
  val weighted = values.map(cand => cand.withWeight(wf(cand)))
  copy(values = weighted)
}
/** Returns a copy of this proposal whose candidates are sorted by their
  * weight using `Candidate.weightOrder`.
  */
def sortByWeight: MetaProposal = {
  val sorted = values.sortBy(cand => cand.weight)(Candidate.weightOrder)
  copy(values = sorted)
}
} }
object MetaProposal { object MetaProposal {
/** Creates a proposal from one mandatory candidate and any number of
  * further candidates, kept in the given order.
  */
def apply(pt: MetaProposalType, v0: Candidate, vm: Candidate*): MetaProposal = {
  val candidates = NonEmptyList(v0, vm.toList)
  MetaProposal(pt, candidates)
}
/** Creates a `DocDate` proposal with a single candidate for the given
  * timestamp. The string form of the timestamp's UTC date is used as
  * both the id and the name of the candidate's reference.
  *
  * @param ts     the timestamp to propose as document date
  * @param origin the label this date came from, if any
  */
def docDate(ts: Timestamp, origin: Option[NerLabel]): MetaProposal = {
  val dateStr   = ts.toUtcDate.toString
  val ref       = IdRef(Ident.unsafe(dateStr), dateStr)
  val candidate = Candidate(ref, origin.toSet)
  MetaProposal(MetaProposalType.DocDate, candidate)
}
def parseDate(cand: Candidate): Option[LocalDate] = def parseDate(cand: Candidate): Option[LocalDate] =
parseDate(cand.ref.id) parseDate(cand.ref.id)
def parseDate(date: Ident): Option[LocalDate] = def parseDate(date: Ident): Option[LocalDate] =
Either.catchNonFatal(LocalDate.parse(date.id)).toOption Either.catchNonFatal(LocalDate.parse(date.id)).toOption
case class Candidate(ref: IdRef, origin: Set[NerLabel]) case class Candidate(ref: IdRef, origin: Set[NerLabel], weight: Option[Double] = None) {
def withWeight(w: Double): Candidate =
copy(weight = Some(w))
}
object Candidate { object Candidate {
implicit val jsonEncoder: Encoder[Candidate] = implicit val jsonEncoder: Encoder[Candidate] =
deriveEncoder[Candidate] deriveEncoder[Candidate]
implicit val jsonDecoder: Decoder[Candidate] = implicit val jsonDecoder: Decoder[Candidate] =
deriveDecoder[Candidate] deriveDecoder[Candidate]
/** Order for optional weights in which a missing weight (`None`)
  * compares greater than any present weight, so that candidates
  * without a weight are sorted last. Present weights are compared
  * using the `Order[Double]` instance.
  */
val weightOrder: Order[Option[Double]] =
  Order.from {
    case (Some(a), Some(b)) => Order[Double].compare(a, b)
    case (None, None)       => 0
    case (None, _)          => 1
    case (_, None)          => -1
  }
} }
/** Merges candidates with same `IdRef' values and concatenates their
* respective labels. The candidate order is preserved.
*/
def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = { def flatten(s: NonEmptyList[Candidate]): NonEmptyList[Candidate] = {
def append(list: List[Candidate]): Candidate = def mergeInto(
list.reduce((l0, l1) => l0.copy(origin = l0.origin ++ l1.origin)) res: NonEmptyList[Candidate],
val grouped = s.toList.groupBy(_.ref.id) el: Candidate
NonEmptyList.fromListUnsafe(grouped.values.toList.map(append)) ): NonEmptyList[Candidate] = {
val l = res.map(c =>
if (c.ref.id == el.ref.id) c.copy(origin = c.origin ++ el.origin) else c
)
if (l == res) l :+ el
else l
}
val init = NonEmptyList.of(s.head)
s.tail.foldLeft(init)(mergeInto)
} }
implicit val jsonDecoder: Decoder[MetaProposal] = implicit val jsonDecoder: Decoder[MetaProposal] =
deriveDecoder[MetaProposal] deriveDecoder[MetaProposal]
implicit val jsonEncoder: Encoder[MetaProposal] = implicit val jsonEncoder: Encoder[MetaProposal] =
deriveEncoder[MetaProposal] deriveEncoder[MetaProposal]
} }

View File

@ -6,6 +6,10 @@ import docspell.common.MetaProposal.Candidate
import io.circe._ import io.circe._
import io.circe.generic.semiauto._ import io.circe.generic.semiauto._
/** A list of proposals for meta data to an item.
*
* The list usually keeps only one value for each `MetaProposalType'.
*/
case class MetaProposalList private (proposals: List[MetaProposal]) { case class MetaProposalList private (proposals: List[MetaProposal]) {
def isEmpty: Boolean = proposals.isEmpty def isEmpty: Boolean = proposals.isEmpty
@ -31,6 +35,14 @@ case class MetaProposalList private (proposals: List[MetaProposal]) {
def find(mpt: MetaProposalType): Option[MetaProposal] = def find(mpt: MetaProposalType): Option[MetaProposal] =
proposals.find(_.proposalType == mpt) proposals.find(_.proposalType == mpt)
/** Returns a new list in which every proposal has been replaced by the
  * result of applying `f` to it.
  */
def change(f: MetaProposal => MetaProposal): MetaProposalList = {
  val changed = for (mp <- proposals) yield f(mp)
  new MetaProposalList(changed)
}
/** Returns a new list containing only the proposals for which `f`
  * returns `true`.
  */
def filter(f: MetaProposal => Boolean): MetaProposalList = {
  val kept = for (mp <- proposals if f(mp)) yield mp
  new MetaProposalList(kept)
}
/** Returns a new list in which the candidates of each proposal have
  * been sorted via `MetaProposal.sortByWeight`.
  */
def sortByWeights: MetaProposalList =
  change(mp => mp.sortByWeight)
} }
object MetaProposalList { object MetaProposalList {
@ -54,6 +66,12 @@ object MetaProposalList {
def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList = def fromMap(m: Map[MetaProposalType, MetaProposal]): MetaProposalList =
new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) })) new MetaProposalList(m.toList.map({ case (k, v) => v.copy(proposalType = k) }))
/** Flattens the given list of meta-proposals into a single list,
* where each meta-proposal type exists at most once. Candidates to
* equal proposal-types are merged together. The candidate's order
* is preserved and candidates of proposals are appended as given
* by the order of the given `ml'.
*/
def flatten(ml: Seq[MetaProposalList]): MetaProposalList = { def flatten(ml: Seq[MetaProposalList]): MetaProposalList = {
val init: Map[MetaProposalType, MetaProposal] = Map.empty val init: Map[MetaProposalType, MetaProposal] = Map.empty

View File

@ -0,0 +1,71 @@
package docspell.common
import minitest._
import cats.data.NonEmptyList
import docspell.common.MetaProposal.Candidate
/** Tests the candidate ordering of `MetaProposalList`: `flatten` must
  * keep candidates in the order the lists were given, and
  * `sortByWeights` must put lower weights first and candidates without
  * a weight last.
  */
object MetaProposalListTest extends SimpleTestSuite {

  /** Creates a candidate with the given id, the fixed name "name", no
    * labels and the given (optional) weight.
    */
  private def candidate(id: String, weight: Option[Double] = None): Candidate =
    Candidate(IdRef(Ident.unsafe(id), "name"), Set.empty, weight)

  /** Wraps a single candidate into a `CorrOrg` proposal. */
  private def corrOrg(c: Candidate): MetaProposal =
    MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(c))

  /** Returns the candidates of the `CorrOrg` proposal; throws if the
    * list has no such proposal.
    */
  private def corrOrgCandidates(mpl: MetaProposalList): NonEmptyList[Candidate] =
    mpl.find(MetaProposalType.CorrOrg).get.values

  test("flatten retains order of candidates") {
    val first  = candidate("123")
    val second = candidate("456")
    val listA  = MetaProposalList.of(corrOrg(first))
    val listB  = MetaProposalList.of(corrOrg(second))

    val forward = corrOrgCandidates(MetaProposalList.flatten(Seq(listA, listB)))
    assertEquals(forward.head, first)
    assertEquals(forward.tail.head, second)

    val backward = corrOrgCandidates(MetaProposalList.flatten(Seq(listB, listA)))
    assertEquals(backward.head, second)
    assertEquals(backward.tail.head, first)
  }

  test("sort by weights") {
    val heavier = candidate("123", Some(0.1))
    val lighter = candidate("456", Some(0.05))
    val sorted  = MetaProposalList.of(corrOrg(heavier), corrOrg(lighter)).sortByWeights

    val result = corrOrgCandidates(sorted)
    assertEquals(result.head, lighter)
    assertEquals(result.tail.head, heavier)
  }

  test("sort by weights: unset is last") {
    val weighted   = candidate("123", Some(0.1))
    val unweighted = candidate("456")
    val sorted     = MetaProposalList.of(corrOrg(weighted), corrOrg(unweighted)).sortByWeights

    val result = corrOrgCandidates(sorted)
    assertEquals(result.head, weighted)
    assertEquals(result.tail.head, unweighted)
  }
}

View File

@ -73,7 +73,8 @@ object CreateItem {
fm, fm,
Vector.empty, Vector.empty,
Vector.empty, Vector.empty,
fm.map(a => a.id -> a.fileId).toMap fm.map(a => a.id -> a.fileId).toMap,
MetaProposalList.empty
) )
} }
@ -110,7 +111,7 @@ object CreateItem {
.map(originFileTuple) .map(originFileTuple)
.toMap .toMap
} yield cand.headOption.map(ri => } yield cand.headOption.map(ri =>
ItemData(ri, rms, Vector.empty, Vector.empty, origMap) ItemData(ri, rms, Vector.empty, Vector.empty, origMap, MetaProposalList.empty)
) )
} }

View File

@ -7,7 +7,8 @@ import docspell.common._
import docspell.joex.scheduler.Task import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta import docspell.store.records.RAttachmentMeta
/** Reorders the proposals to put most probable fits first. /** Calculate weights for candidates that adds the most likely
* candidate a lower number.
*/ */
object EvalProposals { object EvalProposals {
@ -16,24 +17,14 @@ object EvalProposals {
Timestamp Timestamp
.current[F] .current[F]
.map { now => .map { now =>
val metas = data.metas.map(reorderCandidates(now.toUtcDate)) val metas = data.metas.map(calcCandidateWeight(now.toUtcDate))
data.copy(metas = metas) data.copy(metas = metas)
} }
} }
def reorderCandidates(now: LocalDate)(rm: RAttachmentMeta): RAttachmentMeta = { def calcCandidateWeight(now: LocalDate)(rm: RAttachmentMeta): RAttachmentMeta = {
val list = rm.proposals.getTypes.toList val list = rm.proposals.change(mp => mp.addWeights(weight(rm, mp, now)))
.map(mpt => rm.copy(proposals = list.sortByWeights)
rm.proposals.find(mpt) match {
case Some(mp) =>
val v = mp.values.sortBy(weight(rm, mp, now))
Some(mp.copy(values = v))
case None =>
None
}
)
rm.copy(proposals = MetaProposalList(list.flatMap(identity)))
} }
def weight(rm: RAttachmentMeta, mp: MetaProposal, ref: LocalDate)( def weight(rm: RAttachmentMeta, mp: MetaProposal, ref: LocalDate)(

View File

@ -12,6 +12,7 @@ import docspell.joex.scheduler._
import docspell.store.records._ import docspell.store.records._
import docspell.files.Zip import docspell.files.Zip
import cats.kernel.Monoid import cats.kernel.Monoid
import emil.Mail
/** Goes through all attachments and extracts archive files, like zip /** Goes through all attachments and extracts archive files, like zip
* files. The process is recursive, until all archives have been * files. The process is recursive, until all archives have been
@ -56,7 +57,8 @@ object ExtractArchive {
_ <- naa.traverse(storeArchive(ctx)) _ <- naa.traverse(storeArchive(ctx))
} yield naa.headOption -> item.copy( } yield naa.headOption -> item.copy(
attachments = nra, attachments = nra,
originFile = item.originFile ++ nra.map(a => a.id -> a.fileId).toMap originFile = item.originFile ++ nra.map(a => a.id -> a.fileId).toMap,
givenMeta = item.givenMeta.fillEmptyFrom(Monoid[Extracted].combineAll(ras).meta)
) )
} }
@ -139,15 +141,27 @@ object ExtractArchive {
.through(ReadMail.bytesToMail[F](ctx.logger)) .through(ReadMail.bytesToMail[F](ctx.logger))
.flatMap { mail => .flatMap { mail =>
val mId = mail.header.messageId val mId = mail.header.messageId
val givenMeta =
for {
_ <- ctx.logger.debug(s"Use mail date for item date: ${mail.header.date}")
s <- Sync[F].delay(extractMailMeta(mail))
} yield s
ReadMail ReadMail
.mailToEntries(ctx.logger)(mail) .mailToEntries(ctx.logger)(mail)
.flatMap(handleEntry(ctx, ra, archive, mId)) .flatMap(handleEntry(ctx, ra, archive, mId)) ++ Stream.eval(givenMeta)
} }
.foldMonoid .foldMonoid
.compile .compile
.lastOrError .lastOrError
} }
/** Derives given meta data from a mail: if the mail header carries a
  * date, the result holds a document-date proposal for it (without an
  * origin label); otherwise the result is `Extracted.empty`.
  */
def extractMailMeta[F[_]](mail: Mail[F]): Extracted =
  mail.header.date.fold(Extracted.empty) { date =>
    val proposal = MetaProposal.docDate(Timestamp(date), None)
    Extracted.empty.setMeta(proposal)
  }
def handleEntry[F[_]: Sync]( def handleEntry[F[_]: Sync](
ctx: Context[F, _], ctx: Context[F, _],
ra: RAttachment, ra: RAttachment,
@ -187,18 +201,28 @@ object ExtractArchive {
def storeArchive[F[_]: Sync](ctx: Context[F, _])(aa: RAttachmentArchive): F[Int] = def storeArchive[F[_]: Sync](ctx: Context[F, _])(aa: RAttachmentArchive): F[Int] =
ctx.store.transact(RAttachmentArchive.insert(aa)) ctx.store.transact(RAttachmentArchive.insert(aa))
case class Extracted(files: Vector[RAttachment], archives: Vector[RAttachmentArchive]) { case class Extracted(
files: Vector[RAttachment],
archives: Vector[RAttachmentArchive],
meta: MetaProposalList
) {
def ++(e: Extracted) = def ++(e: Extracted) =
Extracted(files ++ e.files, archives ++ e.archives) Extracted(files ++ e.files, archives ++ e.archives, meta.fillEmptyFrom(e.meta))
/** Same as `setMeta(MetaProposalList)`, applied to a list built from
  * the single proposal `m`.
  */
def setMeta(m: MetaProposal): Extracted = {
  val single = MetaProposalList.of(m)
  setMeta(single)
}
/** Returns a copy with the same files and archives whose meta data is
  * the current `meta` combined with `ml` through `fillEmptyFrom`.
  */
def setMeta(ml: MetaProposalList): Extracted =
  copy(meta = meta.fillEmptyFrom(ml))
} }
object Extracted { object Extracted {
val empty = Extracted(Vector.empty, Vector.empty) val empty = Extracted(Vector.empty, Vector.empty, MetaProposalList.empty)
def noArchive(ra: RAttachment): Extracted = def noArchive(ra: RAttachment): Extracted =
Extracted(Vector(ra), Vector.empty) Extracted(Vector(ra), Vector.empty, MetaProposalList.empty)
def of(ra: RAttachment, aa: RAttachmentArchive): Extracted = def of(ra: RAttachment, aa: RAttachmentArchive): Extracted =
Extracted(Vector(ra), Vector(aa)) Extracted(Vector(ra), Vector(aa), MetaProposalList.empty)
implicit val extractedMonoid: Monoid[Extracted] = implicit val extractedMonoid: Monoid[Extracted] =
Monoid.instance(empty, _ ++ _) Monoid.instance(empty, _ ++ _)

View File

@ -1,15 +1,28 @@
package docspell.joex.process package docspell.joex.process
import docspell.common.{Ident, NerDateLabel, NerLabel} import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.process.ItemData.AttachmentDates
import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} import docspell.store.records.{RAttachment, RAttachmentMeta, RItem}
/** Data that is carried across all processing tasks.
*
* @param item the stored item record
* @param attachments the attachments belonging to the item
* @param metas the meta data to each attachment; depending on the
* state of processing, this may be empty
* @param dateLabels a separate list of found dates
* @param originFile a mapping from an attachment id to a filemeta-id
* containing the source or origin file
* @param givenMeta meta data to this item that was not "guessed"
* from an attachment but given and thus is always correct
*/
case class ItemData( case class ItemData(
item: RItem, item: RItem,
attachments: Vector[RAttachment], attachments: Vector[RAttachment],
metas: Vector[RAttachmentMeta], metas: Vector[RAttachmentMeta],
dateLabels: Vector[AttachmentDates], dateLabels: Vector[AttachmentDates],
originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id originFile: Map[Ident, Ident], // maps RAttachment.id -> FileMeta.id
givenMeta: MetaProposalList // given meta data not associated to a specific attachment
) { ) {
def findMeta(attachId: Ident): Option[RAttachmentMeta] = def findMeta(attachId: Ident): Option[RAttachmentMeta] =

View File

@ -10,7 +10,13 @@ object LinkProposal {
def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
Task { ctx => Task { ctx =>
val proposals = MetaProposalList.flatten(data.metas.map(_.proposals)) // sort by weight; order of equal weights is not important, just
// choose one; the others are then suggestions
// doc-date is only set when given explicitly, not from "guessing"
val proposals = MetaProposalList
.flatten(data.metas.map(_.proposals))
.filter(_.proposalType != MetaProposalType.DocDate)
.sortByWeights
ctx.logger.info(s"Starting linking proposals") *> ctx.logger.info(s"Starting linking proposals") *>
MetaProposalType.all MetaProposalType.all
@ -24,8 +30,9 @@ object LinkProposal {
proposalList: MetaProposalList, proposalList: MetaProposalList,
ctx: Context[F, ProcessItemArgs] ctx: Context[F, ProcessItemArgs]
)(mpt: MetaProposalType): F[Result] = )(mpt: MetaProposalType): F[Result] =
proposalList.find(mpt) match { data.givenMeta.find(mpt).orElse(proposalList.find(mpt)) match {
case None => case None =>
ctx.logger.debug(s"No value for $mpt") *>
Result.noneFound(mpt).pure[F] Result.noneFound(mpt).pure[F]
case Some(a) if a.isSingleValue => case Some(a) if a.isSingleValue =>
ctx.logger.info(s"Found one candidate for ${a.proposalType}") *> ctx.logger.info(s"Found one candidate for ${a.proposalType}") *>
@ -69,7 +76,17 @@ object LinkProposal {
RItem.updateConcEquip(itemId, ctx.args.meta.collective, Some(value)) RItem.updateConcEquip(itemId, ctx.args.meta.collective, Some(value))
) )
case MetaProposalType.DocDate => case MetaProposalType.DocDate =>
ctx.logger.debug(s"Not linking document date suggestion ${value.id}").map(_ => 0) MetaProposal.parseDate(value) match {
case Some(ld) =>
val ts = Timestamp.from(ld.atStartOfDay(Timestamp.UTC))
ctx.logger.debug(s"Updating item date ${value.id}") *>
ctx.store.transact(
RItem.updateDate(itemId, ctx.args.meta.collective, Some(ts))
)
case None =>
ctx.logger.info(s"Cannot read value '${value.id}' into a date.") *>
0.pure[F]
}
case MetaProposalType.DueDate => case MetaProposalType.DueDate =>
MetaProposal.parseDate(value) match { MetaProposal.parseDate(value) match {
case Some(ld) => case Some(ld) =>