Store used language for processing on attachmentmeta

Issue: #570
This commit is contained in:
Eike Kettner 2021-01-15 23:30:49 +01:00
parent 6cf3f9be5a
commit a70e9ab614
9 changed files with 113 additions and 14 deletions

View File

@ -78,7 +78,14 @@ object AttachmentPageCount {
s"No attachmentmeta record exists for ${ra.id.id}. Creating new."
) *> ctx.store.transact(
RAttachmentMeta.insert(
RAttachmentMeta(ra.id, None, Nil, MetaProposalList.empty, md.pageCount.some)
RAttachmentMeta(
ra.id,
None,
Nil,
MetaProposalList.empty,
md.pageCount.some,
None
)
)
)
else 0.pure[F]

View File

@ -108,7 +108,18 @@ object ConvertPdf {
ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *>
storePDF(ctx, cfg, ra, pdf)
.flatMap(r =>
txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some))
txt.map(t =>
(
r,
item
.changeMeta(
ra.id,
ctx.args.meta.language,
_.setContentIfEmpty(t.some)
)
.some
)
)
)
case ConversionResult.UnsupportedFormat(mt) =>

View File

@ -32,8 +32,12 @@ case class ItemData(
def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] =
dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty)
def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = {
val item = changeMeta(attachId, f)
def mapMeta(
attachId: Ident,
lang: Language,
f: RAttachmentMeta => RAttachmentMeta
): ItemData = {
val item = changeMeta(attachId, lang, f)
val next = metas.map(a => if (a.id == attachId) item else a)
copy(metas = next)
}
@ -43,13 +47,14 @@ case class ItemData(
def changeMeta(
attachId: Ident,
lang: Language,
f: RAttachmentMeta => RAttachmentMeta
): RAttachmentMeta =
f(findOrCreate(attachId))
f(findOrCreate(attachId, lang))
def findOrCreate(attachId: Ident): RAttachmentMeta =
def findOrCreate(attachId: Ident, lang: Language): RAttachmentMeta =
metas.find(_.id == attachId).getOrElse {
RAttachmentMeta.empty(attachId)
RAttachmentMeta.empty(attachId, lang)
}
}

View File

@ -78,7 +78,7 @@ object TextExtraction {
pair._2
)
val rm = item.findOrCreate(ra.id)
val rm = item.findOrCreate(ra.id, lang)
rm.content match {
case Some(_) =>
ctx.logger.info("TextExtraction skipped, since text is already available.") *>
@ -102,6 +102,7 @@ object TextExtraction {
res <- extractTextFallback(ctx, cfg, ra, lang)(fids)
meta = item.changeMeta(
ra.id,
lang,
rm =>
rm.setContentIfEmpty(
res.map(_.appendPdfMetaToText.text.trim).filter(_.nonEmpty)

View File

@ -0,0 +1,35 @@
-- Adds a "language" column to "attachmentmeta" and backfills it from the
-- "doclang" of the collective that owns each attachment.
-- The column is added without a NOT NULL constraint or a default value.
ALTER TABLE "attachmentmeta"
ADD COLUMN "language" varchar(254);
-- Backfill: one UPDATE per hard-coded language code. Each subquery collects the
-- "attachid" values reached via attachmentmeta -> attachment -> item -> collective
-- where the collective's "doclang" equals the code being written.
-- NOTE(review): only 'deu', 'eng' and 'fra' are handled here; rows whose
-- collective has any other "doclang" value are not touched by this script.
update "attachmentmeta"
set "language" = 'deu'
where "attachid" in (
select "m"."attachid"
from "attachmentmeta" m
inner join "attachment" a on "a"."attachid" = "m"."attachid"
inner join "item" i on "a"."itemid" = "i"."itemid"
inner join "collective" c on "c"."cid" = "i"."cid"
where "c"."doclang" = 'deu'
);
-- Same backfill for collectives whose "doclang" is 'eng'.
update "attachmentmeta"
set "language" = 'eng'
where "attachid" in (
select "m"."attachid"
from "attachmentmeta" m
inner join "attachment" a on "a"."attachid" = "m"."attachid"
inner join "item" i on "a"."itemid" = "i"."itemid"
inner join "collective" c on "c"."cid" = "i"."cid"
where "c"."doclang" = 'eng'
);
-- Same backfill for collectives whose "doclang" is 'fra'.
update "attachmentmeta"
set "language" = 'fra'
where "attachid" in (
select "m"."attachid"
from "attachmentmeta" m
inner join "attachment" a on "a"."attachid" = "m"."attachid"
inner join "item" i on "a"."itemid" = "i"."itemid"
inner join "collective" c on "c"."cid" = "i"."cid"
where "c"."doclang" = 'fra'
);

View File

@ -0,0 +1,14 @@
-- Adds a `language` column to `attachmentmeta` and backfills it from the
-- `doclang` of the collective that owns each attachment.
-- The column is added without a NOT NULL constraint or a default value.
ALTER TABLE `attachmentmeta`
ADD COLUMN (`language` varchar(254));
-- Backfill in a single multi-table UPDATE: the derived table pairs every
-- `attachid` with its collective's `doclang` (attachmentmeta -> attachment ->
-- item -> collective) and is joined back to the update target.
-- Note the alias reuse: inside the derived table `m` and `c` name
-- `attachmentmeta` and `collective`; outside, `m` is the update target and `c`
-- is the derived table itself.
update `attachmentmeta` `m`
inner join (
select `m`.`attachid`, `c`.`doclang`
from `attachmentmeta` m
inner join `attachment` a on `a`.`attachid` = `m`.`attachid`
inner join `item` i on `a`.`itemid` = `i`.`itemid`
inner join `collective` c on `c`.`cid` = `i`.`cid`
) as `c`
set `m`.`language` = `c`.`doclang`
-- The join condition lives in WHERE; only rows whose `language` is still NULL
-- are written.
where `m`.`attachid` = `c`.`attachid` and `m`.`language` is null;

View File

@ -0,0 +1,15 @@
-- Adds a "language" column to "attachmentmeta" and backfills it from the
-- "doclang" of the collective that owns each attachment.
-- The column is added without a NOT NULL constraint or a default value.
ALTER TABLE "attachmentmeta"
ADD COLUMN "language" varchar(254);
-- Backfill: the CTE "attachlang" pairs every "attachid" with its collective's
-- "doclang" (attachmentmeta -> attachment -> item -> collective). It also
-- selects "m"."language", which the UPDATE below does not read from the CTE.
with
"attachlang" as (
select "m"."attachid", "m"."language", "c"."doclang"
from "attachmentmeta" m
inner join "attachment" a on "a"."attachid" = "m"."attachid"
inner join "item" i on "a"."itemid" = "i"."itemid"
inner join "collective" c on "c"."cid" = "i"."cid"
)
-- In the UPDATE, "m" is the target table and "c" is the CTE; only rows whose
-- "language" is still NULL are written.
update "attachmentmeta" as "m"
set "language" = "c"."doclang"
from "attachlang" c
where "m"."attachid" = "c"."attachid" and "m"."language" is null;

View File

@ -160,7 +160,15 @@ object QAttachment {
chunkSize: Int
): Stream[ConnectionIO, ContentAndName] =
Select(
select(a.id, a.itemId, item.cid, item.folder, c.language, a.name, am.content),
select(
a.id.s,
a.itemId.s,
item.cid.s,
item.folder.s,
coalesce(am.language.s, c.language.s).s,
a.name.s,
am.content.s
),
from(a)
.innerJoin(am, am.id === a.id)
.innerJoin(item, item.id === a.itemId)

View File

@ -15,7 +15,8 @@ case class RAttachmentMeta(
content: Option[String],
nerlabels: List[NerLabel],
proposals: MetaProposalList,
pages: Option[Int]
pages: Option[Int],
language: Option[Language]
) {
def setContentIfEmpty(txt: Option[String]): RAttachmentMeta =
@ -27,8 +28,8 @@ case class RAttachmentMeta(
}
object RAttachmentMeta {
def empty(attachId: Ident) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None)
def empty(attachId: Ident, lang: Language) =
RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty, None, Some(lang))
final case class Table(alias: Option[String]) extends TableDef {
val tableName = "attachmentmeta"
@ -38,7 +39,9 @@ object RAttachmentMeta {
val nerlabels = Column[List[NerLabel]]("nerlabels", this)
val proposals = Column[MetaProposalList]("itemproposals", this)
val pages = Column[Int]("page_count", this)
val all = NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages)
val language = Column[Language]("language", this)
val all =
NonEmptyList.of[Column[_]](id, content, nerlabels, proposals, pages, language)
}
val T = Table(None)
@ -49,7 +52,7 @@ object RAttachmentMeta {
DML.insert(
T,
T.all,
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages}"
fr"${v.id},${v.content},${v.nerlabels},${v.proposals},${v.pages},${v.language}"
)
def exists(attachId: Ident): ConnectionIO[Boolean] =