From a70e9ab614e2a52684f754d4d563c1d3b45e5274 Mon Sep 17 00:00:00 2001 From: Eike Kettner Date: Fri, 15 Jan 2021 23:30:49 +0100 Subject: [PATCH] Store used language for processing on attachmentmeta Issue: #570 --- .../joex/process/AttachmentPageCount.scala | 9 ++++- .../docspell/joex/process/ConvertPdf.scala | 13 ++++++- .../docspell/joex/process/ItemData.scala | 15 +++++--- .../joex/process/TextExtraction.scala | 3 +- .../migration/h2/V1.17.0__meta_language.sql | 35 +++++++++++++++++++ .../mariadb/V1.17.0__meta_language.sql | 14 ++++++++ .../postgresql/V1.17.0__meta_language.sql | 15 ++++++++ .../docspell/store/queries/QAttachment.scala | 10 +++++- .../store/records/RAttachmentMeta.scala | 13 ++++--- 9 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql create mode 100644 modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql diff --git a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala index f3cf7b0e..0373db8a 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/AttachmentPageCount.scala @@ -78,7 +78,14 @@ object AttachmentPageCount { s"No attachmentmeta record exists for ${ra.id.id}. Creating new." ) *> ctx.store.transact( RAttachmentMeta.insert( - RAttachmentMeta(ra.id, None, Nil, MetaProposalList.empty, md.pageCount.some) + RAttachmentMeta( + ra.id, + None, + Nil, + MetaProposalList.empty, + md.pageCount.some, + None + ) ) ) else 0.pure[F] diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala index 65ff0dda..56c27666 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -108,7 +108,18 @@ object ConvertPdf { ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *> storePDF(ctx, cfg, ra, pdf) .flatMap(r => - txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some)) + txt.map(t => + ( + r, + item + .changeMeta( + ra.id, + ctx.args.meta.language, + _.setContentIfEmpty(t.some) + ) + .some + ) + ) ) case ConversionResult.UnsupportedFormat(mt) => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index af9a3db2..0435e37c 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -32,8 +32,12 @@ case class ItemData( def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] = dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty) - def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = { - val item = changeMeta(attachId, f) + def mapMeta( + attachId: Ident, + lang: Language, + f: RAttachmentMeta => RAttachmentMeta + ): ItemData = { + val item = changeMeta(attachId, lang, f) val next = metas.map(a => if (a.id == attachId) item else a) copy(metas = next) } @@ -43,13 +47,14 @@ case class ItemData( def changeMeta( attachId: Ident, + lang: Language, f: RAttachmentMeta => RAttachmentMeta ): RAttachmentMeta = - f(findOrCreate(attachId)) + f(findOrCreate(attachId, lang)) - def findOrCreate(attachId: Ident): RAttachmentMeta = + def findOrCreate(attachId: Ident, lang: Language): RAttachmentMeta = metas.find(_.id == attachId).getOrElse { - RAttachmentMeta.empty(attachId) + RAttachmentMeta.empty(attachId, lang) } } diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index db2988b8..fee7d323 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -78,7 +78,7 @@ object TextExtraction { pair._2 ) - val rm = item.findOrCreate(ra.id) + val rm = item.findOrCreate(ra.id, lang) rm.content match { case Some(_) => ctx.logger.info("TextExtraction skipped, since text is already available.") *> @@ -102,6 +102,7 @@ object TextExtraction { res <- extractTextFallback(ctx, cfg, ra, lang)(fids) meta = item.changeMeta( ra.id, + lang, rm => rm.setContentIfEmpty( res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty) diff --git a/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql new file mode 100644 index 00000000..35004e08 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/h2/V1.17.0__meta_language.sql @@ -0,0 +1,35 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "language" varchar(254); + +update "attachmentmeta" +set "language" = 'deu' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'deu' +); + +update "attachmentmeta" +set "language" = 'eng' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'eng' +); + +update "attachmentmeta" +set "language" = 'fra' +where "attachid" in ( + select "m"."attachid" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + where "c"."doclang" = 'fra' +); diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql new file mode 100644 index 00000000..bd12e732 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.17.0__meta_language.sql @@ -0,0 +1,14 @@ +ALTER TABLE `attachmentmeta` +ADD COLUMN (`language` varchar(254)); + +update `attachmentmeta` `m` +inner join ( + select `m`.`attachid`, `c`.`doclang` + from `attachmentmeta` m + inner join `attachment` a on `a`.`attachid` = `m`.`attachid` + inner join `item` i on `a`.`itemid` = `i`.`itemid` + inner join `collective` c on `c`.`cid` = `i`.`cid` + ) as `c` +set `m`.`language` = `c`.`doclang` +where `m`.`attachid` = `c`.`attachid` and `m`.`language` is null; + diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql new file mode 100644 index 00000000..ba84fc2a --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.17.0__meta_language.sql @@ -0,0 +1,15 @@ +ALTER TABLE "attachmentmeta" +ADD COLUMN "language" varchar(254); + +with + "attachlang" as ( + select "m"."attachid", "m"."language", "c"."doclang" + from "attachmentmeta" m + inner join "attachment" a on "a"."attachid" = "m"."attachid" + inner join "item" i on "a"."itemid" = "i"."itemid" + inner join "collective" c on "c"."cid" = "i"."cid" + ) +update "attachmentmeta" as "m" +set "language" = "c"."doclang" +from "attachlang" c +where "m"."attachid" = "c"."attachid" and "m"."language" is null; diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index 6ac9327a..a9afc0bf 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -160,7 +160,15 @@ object QAttachment { chunkSize: Int ): Stream[ConnectionIO, ContentAndName] = Select( - select(a.id, a.itemId, item.cid, item.folder, c.language, a.name, am.content), + select( + a.id.s, + a.itemId.s, + item.cid.s, + item.folder.s, + coalesce(am.language.s, c.language.s).s, + a.name.s, + am.content.s + ), from(a) .innerJoin(am, am.id === a.id) .innerJoin(item, item.id === a.itemId) diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index 4adfbad7..919a5b17 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -15,7 +15,8 @@ case class RAttachmentMeta( content: Option[String], nerlabels: List[NerLabel], proposals: MetaProposalList, - pages: Option[Int] + pages: Option[Int], + language: Option[Language] ) { def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = @@ -27,8 +28,8 @@ case class RAttachmentMeta( } object RAttachmentMeta { - def empty(attachId: Ident) = - RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None) + def empty(attachId: Ident, lang: Language) = + RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang)) final case class Table(alias: Option[String]) extends TableDef { val tableName = "attachmentmeta" @@ -38,7 +39,9 @@ object RAttachmentMeta { val nerlabels = Column[List[NerLabel]]("nerlabels", this) val proposals = Column[MetaProposalList]("itemproposals", this) val pages = Column[Int]("page_count", this) - val all = NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages) + val language = Column[Language]("language", this) + val all = + NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language) } val T = Table(None) @@ -49,7 +52,7 @@ object RAttachmentMeta { DML.insert( T, T.all, - fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}" + fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}" ) def exists(attachId: Ident): ConnectionIO[Boolean] =