Search archives when looking for files via checksum

This commit is contained in:
Eike Kettner 2020-03-19 22:42:48 +01:00
parent 6b1156182c
commit 439aaee27b
2 changed files with 69 additions and 23 deletions

View File

@ -1,5 +1,22 @@
# Changelog # Changelog
## v0.4.0
*unknown*
- Support for archive files. Archives are files that contain other
files, like zip files. Docspell now extracts archives and adds the
content to an item. The extraction process is recursive, so there
may be zip files in zip files. File types supported:
- `zip` every file inside is added to one item as attachment
- `eml` (RFC822 E-Mail files) E-mails are considered archives, since
they may contain multiple files (body and attachments).
- Periodic Tasks framework: Docspell can now run tasks periodically
based on a schedule. This is not yet exposed to the user, but there
are some system cleanup jobs to start with.
- Improvement of the text analysis. For my test files there was an
increase in accuracy by about 10%.
## v0.3.0 ## v0.3.0
*Mar. 1, 2020* *Mar. 1, 2020*

View File

@ -40,7 +40,11 @@ object QItem {
val ICC = List(RItem.Columns.id, RItem.Columns.name).map(_.prefix("ref")) val ICC = List(RItem.Columns.id, RItem.Columns.name).map(_.prefix("ref"))
val cq = val cq =
selectSimple(IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC, RItem.table ++ fr"i", Fragment.empty) ++ selectSimple(
IC ++ OC ++ P0C ++ P1C ++ EC ++ ICC,
RItem.table ++ fr"i",
Fragment.empty
) ++
fr"LEFT JOIN" ++ ROrganization.table ++ fr"o ON" ++ RItem.Columns.corrOrg fr"LEFT JOIN" ++ ROrganization.table ++ fr"o ON" ++ RItem.Columns.corrOrg
.prefix("i") .prefix("i")
.is(ROrganization.Columns.oid.prefix("o")) ++ .is(ROrganization.Columns.oid.prefix("o")) ++
@ -179,7 +183,11 @@ object QItem {
// inclusive tags are AND-ed // inclusive tags are AND-ed
val tagSelectsIncl = q.tagsInclude val tagSelectsIncl = q.tagsInclude
.map(tid => .map(tid =>
selectSimple(List(RTagItem.Columns.itemId), RTagItem.table, RTagItem.Columns.tagId.is(tid)) selectSimple(
List(RTagItem.Columns.itemId),
RTagItem.table,
RTagItem.Columns.tagId.is(tid)
)
) )
.map(f => sql"(" ++ f ++ sql") ") .map(f => sql"(" ++ f ++ sql") ")
@ -207,21 +215,28 @@ object QItem {
REquipment.Columns.eid.prefix("e1").isOrDiscard(q.concEquip), REquipment.Columns.eid.prefix("e1").isOrDiscard(q.concEquip),
if (q.tagsInclude.isEmpty) Fragment.empty if (q.tagsInclude.isEmpty) Fragment.empty
else else
IC.id.prefix("i") ++ sql" IN (" ++ tagSelectsIncl.reduce(_ ++ fr"INTERSECT" ++ _) ++ sql")", IC.id.prefix("i") ++ sql" IN (" ++ tagSelectsIncl
.reduce(_ ++ fr"INTERSECT" ++ _) ++ sql")",
if (q.tagsExclude.isEmpty) Fragment.empty if (q.tagsExclude.isEmpty) Fragment.empty
else IC.id.prefix("i").f ++ sql" NOT IN (" ++ tagSelectsExcl ++ sql")", else IC.id.prefix("i").f ++ sql" NOT IN (" ++ tagSelectsExcl ++ sql")",
q.dateFrom q.dateFrom
.map(d => coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr">= $d") .map(d =>
coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr">= $d"
)
.getOrElse(Fragment.empty), .getOrElse(Fragment.empty),
q.dateTo q.dateTo
.map(d => coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr"<= $d") .map(d =>
coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr"<= $d"
)
.getOrElse(Fragment.empty), .getOrElse(Fragment.empty),
q.dueDateFrom.map(d => IC.dueDate.prefix("i").isGt(d)).getOrElse(Fragment.empty), q.dueDateFrom.map(d => IC.dueDate.prefix("i").isGt(d)).getOrElse(Fragment.empty),
q.dueDateTo.map(d => IC.dueDate.prefix("i").isLt(d)).getOrElse(Fragment.empty) q.dueDateTo.map(d => IC.dueDate.prefix("i").isLt(d)).getOrElse(Fragment.empty)
) )
val order = orderBy(coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr"DESC") val order = orderBy(
val frag = query ++ fr"WHERE" ++ cond ++ order coalesce(IC.itemDate.prefix("i").f, IC.created.prefix("i").f) ++ fr"DESC"
)
val frag = query ++ fr"WHERE" ++ cond ++ order
logger.trace(s"List items: $frag") logger.trace(s"List items: $frag")
frag.query[ListItem].stream frag.query[ListItem].stream
} }
@ -247,25 +262,39 @@ object QItem {
} }
/** Finds the items of the given collective that have an attachment
  * related to a file with the given checksum.
  *
  * The checksum is compared against three file-meta rows per attachment:
  * the attachment's own file (`m1`), the file of its attachment source
  * (`m2`) and the file of its attachment archive (`m3`), if there is one.
  *
  * @param checksum   the checksum to look for
  * @param collective restricts the result to items of this collective
  * @return all matching items; an item may appear more than once if
  *         several of its attachments match
  */
def findByChecksum(checksum: String, collective: Ident): ConnectionIO[Vector[RItem]] = {
  val IC         = RItem.Columns.all.map(_.prefix("i"))
  val aItem      = RAttachment.Columns.itemId.prefix("a")
  val aId        = RAttachment.Columns.id.prefix("a")
  val aFileId    = RAttachment.Columns.fileId.prefix("a")
  val iId        = RItem.Columns.id.prefix("i")
  val iColl      = RItem.Columns.cid.prefix("i")
  val sId        = RAttachmentSource.Columns.id.prefix("s")
  val sFileId    = RAttachmentSource.Columns.fileId.prefix("s")
  val rId        = RAttachmentArchive.Columns.id.prefix("r")
  val rFileId    = RAttachmentArchive.Columns.fileId.prefix("r")
  val m1Id       = RFileMeta.Columns.id.prefix("m1")
  val m2Id       = RFileMeta.Columns.id.prefix("m2")
  val m3Id       = RFileMeta.Columns.id.prefix("m3")
  val m1Checksum = RFileMeta.Columns.checksum.prefix("m1")
  val m2Checksum = RFileMeta.Columns.checksum.prefix("m2")
  val m3Checksum = RFileMeta.Columns.checksum.prefix("m3")

  // The archive row is optional, so BOTH the archive join and the join to
  // its file meta (m3) must be outer joins. An INNER JOIN on m3 would drop
  // every attachment without an archive row (r.fileId is NULL there) and
  // thereby hide matches on m1/m2 for files that were not part of an archive.
  val from =
    RItem.table ++ fr"i INNER JOIN" ++ RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++
      fr"INNER JOIN" ++ RAttachmentSource.table ++ fr"s ON" ++ aId.is(sId) ++
      fr"INNER JOIN" ++ RFileMeta.table ++ fr"m1 ON" ++ m1Id.is(aFileId) ++
      fr"INNER JOIN" ++ RFileMeta.table ++ fr"m2 ON" ++ m2Id.is(sFileId) ++
      fr"LEFT OUTER JOIN" ++ RAttachmentArchive.table ++ fr"r ON" ++ aId.is(rId) ++
      fr"LEFT OUTER JOIN" ++ RFileMeta.table ++ fr"m3 ON" ++ m3Id.is(rFileId)

  selectSimple(
    IC,
    from,
    and(
      or(m1Checksum.is(checksum), m2Checksum.is(checksum), m3Checksum.is(checksum)),
      iColl.is(collective)
    )
  ).query[RItem]
    .to[Vector]
}