From d65c1e0d36cd8fa9c8ae0a97f1ad65b79ef86b99 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Sat, 16 May 2020 14:18:59 +0200 Subject: [PATCH] Use date from e-mails to set item date --- .../scala/docspell/common/MetaProposal.scala | 11 ++++++ .../docspell/common/MetaProposalList.scala | 3 ++ .../common/MetaProposalListTest.scala | 20 ++++++---- .../joex/process/ExtractArchive.scala | 38 +++++++++++++++---- .../docspell/joex/process/ItemData.scala | 10 +++++ .../docspell/joex/process/LinkProposal.scala | 21 ++++++++-- 6 files changed, 85 insertions(+), 18 deletions(-) diff --git a/modules/common/src/main/scala/docspell/common/MetaProposal.scala b/modules/common/src/main/scala/docspell/common/MetaProposal.scala index d3d40012..d1e236ec 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposal.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposal.scala @@ -43,6 +43,17 @@ case class MetaProposal(proposalType: MetaProposalType, values: NonEmptyList[Can object MetaProposal { + def apply(pt: MetaProposalType, v0: Candidate, vm: Candidate*): MetaProposal = + MetaProposal(pt, NonEmptyList.of(v0, vm: _*)) + + def docDate(ts: Timestamp, origin: Option[NerLabel]): MetaProposal = { + val label = ts.toUtcDate.toString + MetaProposal( + MetaProposalType.DocDate, + Candidate(IdRef(Ident.unsafe(label), label), origin.toSet) + ) + } + def parseDate(cand: Candidate): Option[LocalDate] = parseDate(cand.ref.id) diff --git a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala index d72f5f85..4b62b686 100644 --- a/modules/common/src/main/scala/docspell/common/MetaProposalList.scala +++ b/modules/common/src/main/scala/docspell/common/MetaProposalList.scala @@ -38,6 +38,9 @@ case class MetaProposalList private (proposals: List[MetaProposal]) { def change(f: MetaProposal => MetaProposal): MetaProposalList = new MetaProposalList(proposals.map(f)) + def filter(f: 
MetaProposal => Boolean): MetaProposalList = + new MetaProposalList(proposals.filter(f)) + def sortByWeights: MetaProposalList = change(_.sortByWeight) } diff --git a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala index a8ffde51..4b652f62 100644 --- a/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala +++ b/modules/common/src/test/scala/docspell/common/MetaProposalListTest.scala @@ -42,10 +42,12 @@ object MetaProposalListTest extends SimpleTestSuite { test("sort by weights") { val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty, Some(0.1)) val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty, Some(0.05)) - val mpl = MetaProposalList.of( - MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), - MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) - ).sortByWeights + val mpl = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) + ) + .sortByWeights val candidates = mpl.find(MetaProposalType.CorrOrg).get.values assertEquals(candidates.head, cand2) @@ -55,10 +57,12 @@ object MetaProposalListTest extends SimpleTestSuite { test("sort by weights: unset is last") { val cand1 = Candidate(IdRef(Ident.unsafe("123"), "name"), Set.empty, Some(0.1)) val cand2 = Candidate(IdRef(Ident.unsafe("456"), "name"), Set.empty) - val mpl = MetaProposalList.of( - MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), - MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) - ).sortByWeights + val mpl = MetaProposalList + .of( + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand1)), + MetaProposal(MetaProposalType.CorrOrg, NonEmptyList.of(cand2)) + ) + .sortByWeights val candidates = mpl.find(MetaProposalType.CorrOrg).get.values assertEquals(candidates.head, cand1) diff --git 
a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala index 4429203b..ddb184ab 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ExtractArchive.scala @@ -12,6 +12,7 @@ import docspell.joex.scheduler._ import docspell.store.records._ import docspell.files.Zip import cats.kernel.Monoid +import emil.Mail /** Goes through all attachments and extracts archive files, like zip * files. The process is recursive, until all archives have been @@ -56,7 +57,8 @@ object ExtractArchive { _ <- naa.traverse(storeArchive(ctx)) } yield naa.headOption -> item.copy( attachments = nra, - originFile = item.originFile ++ nra.map(a => a.id -> a.fileId).toMap + originFile = item.originFile ++ nra.map(a => a.id -> a.fileId).toMap, + givenMeta = item.givenMeta.fillEmptyFrom(Monoid[Extracted].combineAll(ras).meta) ) } @@ -139,15 +141,27 @@ object ExtractArchive { .through(ReadMail.bytesToMail[F](ctx.logger)) .flatMap { mail => val mId = mail.header.messageId + val givenMeta = + for { + _ <- ctx.logger.debug(s"Use mail date for item date: ${mail.header.date}") + s <- Sync[F].delay(extractMailMeta(mail)) + } yield s + ReadMail .mailToEntries(ctx.logger)(mail) - .flatMap(handleEntry(ctx, ra, archive, mId)) + .flatMap(handleEntry(ctx, ra, archive, mId)) ++ Stream.eval(givenMeta) } .foldMonoid .compile .lastOrError } + def extractMailMeta[F[_]](mail: Mail[F]): Extracted = + mail.header.date + .map(Timestamp.apply) + .map(ts => Extracted.empty.setMeta(MetaProposal.docDate(ts, None))) + .getOrElse(Extracted.empty) + def handleEntry[F[_]: Sync]( ctx: Context[F, _], ra: RAttachment, @@ -187,18 +201,28 @@ object ExtractArchive { def storeArchive[F[_]: Sync](ctx: Context[F, _])(aa: RAttachmentArchive): F[Int] = ctx.store.transact(RAttachmentArchive.insert(aa)) - case class Extracted(files: Vector[RAttachment], archives: 
Vector[RAttachmentArchive]) { + case class Extracted( + files: Vector[RAttachment], + archives: Vector[RAttachmentArchive], + meta: MetaProposalList + ) { def ++(e: Extracted) = - Extracted(files ++ e.files, archives ++ e.archives) + Extracted(files ++ e.files, archives ++ e.archives, meta.fillEmptyFrom(e.meta)) + + def setMeta(m: MetaProposal): Extracted = + setMeta(MetaProposalList.of(m)) + + def setMeta(ml: MetaProposalList): Extracted = + Extracted(files, archives, meta.fillEmptyFrom(ml)) } object Extracted { - val empty = Extracted(Vector.empty, Vector.empty) + val empty = Extracted(Vector.empty, Vector.empty, MetaProposalList.empty) def noArchive(ra: RAttachment): Extracted = - Extracted(Vector(ra), Vector.empty) + Extracted(Vector(ra), Vector.empty, MetaProposalList.empty) def of(ra: RAttachment, aa: RAttachmentArchive): Extracted = - Extracted(Vector(ra), Vector(aa)) + Extracted(Vector(ra), Vector(aa), MetaProposalList.empty) implicit val extractedMonoid: Monoid[Extracted] = Monoid.instance(empty, _ ++ _) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index 7d8e7729..46ef9f8c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -5,6 +5,16 @@ import docspell.joex.process.ItemData.AttachmentDates import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} /** Data that is carried across all processing tasks. 
+ * + * @param item the stored item record + * @param attachments the attachments belonging to the item + * @param metas the meta data to each attachment; depending on the + * state of processing, this may be empty + * @param dateLabels a separate list of found dates + * @param originFile a mapping from an attachment id to a filemeta-id + * containing the source or origin file + * @param givenMeta meta data to this item that was not "guessed" + * from an attachment but given and thus is always correct */ case class ItemData( item: RItem, diff --git a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala index ca875215..7552b8db 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/LinkProposal.scala @@ -12,7 +12,11 @@ object LinkProposal { Task { ctx => // sort by weight; order of equal weights is not important, just // choose one others are then suggestions - val proposals = MetaProposalList.flatten(data.metas.map(_.proposals)).sortByWeights + // doc-date is only set when given explicitly, not from "guessing" + val proposals = MetaProposalList + .flatten(data.metas.map(_.proposals)) + .filter(_.proposalType != MetaProposalType.DocDate) + .sortByWeights ctx.logger.info(s"Starting linking proposals") *> MetaProposalType.all @@ -28,7 +32,8 @@ object LinkProposal { )(mpt: MetaProposalType): F[Result] = data.givenMeta.find(mpt).orElse(proposalList.find(mpt)) match { case None => - Result.noneFound(mpt).pure[F] + ctx.logger.debug(s"No value for $mpt") *> + Result.noneFound(mpt).pure[F] case Some(a) if a.isSingleValue => ctx.logger.info(s"Found one candidate for ${a.proposalType}") *> setItemMeta(data.item.id, ctx, a.proposalType, a.values.head.ref.id).map(_ => @@ -71,7 +76,17 @@ object LinkProposal { RItem.updateConcEquip(itemId, ctx.args.meta.collective, Some(value)) ) case MetaProposalType.DocDate => 
- ctx.logger.debug(s"Not linking document date suggestion ${value.id}").map(_ => 0) + MetaProposal.parseDate(value) match { + case Some(ld) => + val ts = Timestamp.from(ld.atStartOfDay(Timestamp.UTC)) + ctx.logger.debug(s"Updating item date ${value.id}") *> + ctx.store.transact( + RItem.updateDate(itemId, ctx.args.meta.collective, Some(ts)) + ) + case None => + ctx.logger.info(s"Cannot read value '${value.id}' into a date.") *> + 0.pure[F] + } case MetaProposalType.DueDate => MetaProposal.parseDate(value) match { case Some(ld) =>