mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Implement learning a text classifier from collective data
This commit is contained in:
@ -67,8 +67,8 @@ trait DoobieSyntax {
|
||||
Fragment.const(" FROM ") ++ table ++ this.where(where)
|
||||
|
||||
def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
|
||||
Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
|
||||
Fragment.const(") FROM ") ++ table ++ this.where(where)
|
||||
Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
|
||||
Fragment.const(" FROM ") ++ table ++ this.where(where)
|
||||
|
||||
def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
|
||||
Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this
|
||||
|
@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
|
||||
import cats.implicits._
|
||||
import fs2.Stream
|
||||
|
||||
import docspell.common.syntax.all._
|
||||
import docspell.common.{IdRef, _}
|
||||
import docspell.store.Store
|
||||
import docspell.store.impl.Implicits._
|
||||
@ -615,4 +616,74 @@ object QItem {
|
||||
.query[NameAndNotes]
|
||||
.streamWithChunkSize(chunkSize)
|
||||
}
|
||||
|
||||
def findAllNewesFirst(
|
||||
collective: Ident,
|
||||
chunkSize: Int
|
||||
): Stream[ConnectionIO, Ident] = {
|
||||
val cols = Seq(RItem.Columns.id)
|
||||
(selectSimple(cols, RItem.table, RItem.Columns.cid.is(collective)) ++
|
||||
orderBy(RItem.Columns.created.desc))
|
||||
.query[Ident]
|
||||
.streamWithChunkSize(chunkSize)
|
||||
}
|
||||
|
||||
case class TagName(id: Ident, name: String)
|
||||
case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
|
||||
|
||||
def resolveTextAndTag(
|
||||
collective: Ident,
|
||||
itemId: Ident,
|
||||
tagCategory: String
|
||||
): ConnectionIO[TextAndTag] = {
|
||||
val aId = RAttachment.Columns.id.prefix("a")
|
||||
val aItem = RAttachment.Columns.itemId.prefix("a")
|
||||
val mId = RAttachmentMeta.Columns.id.prefix("m")
|
||||
val mText = RAttachmentMeta.Columns.content.prefix("m")
|
||||
val tiItem = RTagItem.Columns.itemId.prefix("ti")
|
||||
val tiTag = RTagItem.Columns.tagId.prefix("ti")
|
||||
val tId = RTag.Columns.tid.prefix("t")
|
||||
val tName = RTag.Columns.name.prefix("t")
|
||||
val tCat = RTag.Columns.category.prefix("t")
|
||||
val iId = RItem.Columns.id.prefix("i")
|
||||
val iColl = RItem.Columns.cid.prefix("i")
|
||||
|
||||
val cte = withCTE(
|
||||
"tags" -> selectSimple(
|
||||
Seq(tiItem, tId, tName),
|
||||
RTagItem.table ++ fr"ti INNER JOIN" ++
|
||||
RTag.table ++ fr"t ON" ++ tId.is(tiTag),
|
||||
and(tiItem.is(itemId), tCat.is(tagCategory))
|
||||
)
|
||||
)
|
||||
|
||||
val cols = Seq(mText, tId, tName)
|
||||
|
||||
val from = RItem.table ++ fr"i INNER JOIN" ++
|
||||
RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
|
||||
RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
|
||||
fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)
|
||||
|
||||
val where =
|
||||
and(
|
||||
iId.is(itemId),
|
||||
iColl.is(collective),
|
||||
mText.isNotNull,
|
||||
mText.isNot("")
|
||||
)
|
||||
|
||||
val q = cte ++ selectDistinct(cols, from, where)
|
||||
for {
|
||||
_ <- logger.ftrace[ConnectionIO](
|
||||
s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
|
||||
)
|
||||
texts <- q.query[(String, Option[TagName])].to[List]
|
||||
_ <- logger.ftrace[ConnectionIO](
|
||||
s"Got ${texts.size} text and tag entries for item ${itemId.id}"
|
||||
)
|
||||
tag = texts.headOption.flatMap(_._2)
|
||||
txt = texts.map(_._1).mkString(" --n-- ")
|
||||
} yield TextAndTag(itemId, txt, tag)
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -61,6 +61,9 @@ object RClassifierSetting {
|
||||
sql.update.run
|
||||
}
|
||||
|
||||
def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] =
|
||||
updateRow(table, cid.is(coll), fileId.setTo(fid)).update.run
|
||||
|
||||
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
|
||||
for {
|
||||
n1 <- updateRow(
|
||||
|
Reference in New Issue
Block a user