Implement learning a text classifier from collective data

This commit is contained in:
Eike Kettner
2020-09-01 07:50:21 +02:00
parent 68bb65572b
commit 316b490008
5 changed files with 130 additions and 18 deletions

View File

@ -67,8 +67,8 @@ trait DoobieSyntax {
Fragment.const(" FROM ") ++ table ++ this.where(where)
def selectDistinct(cols: Seq[Column], table: Fragment, where: Fragment): Fragment =
Fragment.const("SELECT DISTINCT(") ++ commas(cols.map(_.f)) ++
Fragment.const(") FROM ") ++ table ++ this.where(where)
Fragment.const("SELECT DISTINCT ") ++ commas(cols.map(_.f)) ++
Fragment.const(" FROM ") ++ table ++ this.where(where)
def selectCount(col: Column, table: Fragment, where: Fragment): Fragment =
Fragment.const("SELECT COUNT(") ++ col.f ++ Fragment.const(") FROM ") ++ table ++ this

View File

@ -7,6 +7,7 @@ import cats.effect.concurrent.Ref
import cats.implicits._
import fs2.Stream
import docspell.common.syntax.all._
import docspell.common.{IdRef, _}
import docspell.store.Store
import docspell.store.impl.Implicits._
@ -615,4 +616,74 @@ object QItem {
.query[NameAndNotes]
.streamWithChunkSize(chunkSize)
}
/** Streams the ids of all items belonging to the given collective,
  * ordered by the item's `created` column in descending order.
  *
  * @param collective the collective whose items are selected
  * @param chunkSize  the chunk size passed on to `streamWithChunkSize`
  */
def findAllNewesFirst(
    collective: Ident,
    chunkSize: Int
): Stream[ConnectionIO, Ident] = {
  val ofCollective = RItem.Columns.cid.is(collective)
  val newestFirst  = orderBy(RItem.Columns.created.desc)
  val selectIds    = selectSimple(Seq(RItem.Columns.id), RItem.table, ofCollective)

  (selectIds ++ newestFirst)
    .query[Ident]
    .streamWithChunkSize(chunkSize)
}
/** Id and name of a tag, as selected by `resolveTextAndTag`. */
case class TagName(id: Ident, name: String)
/** Result of `resolveTextAndTag`: the id of the item, the text
  * collected for that item and an optional tag.
  */
case class TextAndTag(itemId: Ident, text: String, tag: Option[TagName])
/** Loads the attachment texts of an item together with a tag of the
  * given category, if the item has one.
  *
  * All non-null, non-empty `content` values of the item's attachment
  * meta rows are joined with `" --n-- "` into a single string. The tag
  * is taken from the first result row; when the item has no tag of the
  * requested category it is `None`. If the query returns no rows at
  * all, the text is empty and the tag is `None`.
  *
  * @param collective  the collective the item must belong to
  * @param itemId      the item whose texts and tag are resolved
  * @param tagCategory only tags of this category are considered
  * @return the item id, the joined text and the optional tag
  */
def resolveTextAndTag(
    collective: Ident,
    itemId: Ident,
    tagCategory: String
): ConnectionIO[TextAndTag] = {
  val aId    = RAttachment.Columns.id.prefix("a")
  val aItem  = RAttachment.Columns.itemId.prefix("a")
  val mId    = RAttachmentMeta.Columns.id.prefix("m")
  val mText  = RAttachmentMeta.Columns.content.prefix("m")
  val tiItem = RTagItem.Columns.itemId.prefix("ti")
  val tiTag  = RTagItem.Columns.tagId.prefix("ti")
  val tId    = RTag.Columns.tid.prefix("t")
  val tName  = RTag.Columns.name.prefix("t")
  val tCat   = RTag.Columns.category.prefix("t")
  val iId    = RItem.Columns.id.prefix("i")
  val iColl  = RItem.Columns.cid.prefix("i")

  // CTE "tags": the tags of this item restricted to the given category.
  val cte = withCTE(
    "tags" -> selectSimple(
      Seq(tiItem, tId, tName),
      RTagItem.table ++ fr"ti INNER JOIN" ++
        RTag.table ++ fr"t ON" ++ tId.is(tiTag),
      and(tiItem.is(itemId), tCat.is(tagCategory))
    )
  )

  val cols = Seq(mText, tId, tName)
  // The CTE is joined under the alias "t", so its item-id column is
  // addressed with the "t" prefix. LEFT JOIN keeps items without a tag.
  val from = RItem.table ++ fr"i INNER JOIN" ++
    RAttachment.table ++ fr"a ON" ++ aItem.is(iId) ++ fr"INNER JOIN" ++
    RAttachmentMeta.table ++ fr"m ON" ++ aId.is(mId) ++ fr"LEFT JOIN" ++
    fr"tags t ON" ++ RTagItem.Columns.itemId.prefix("t").is(iId)

  val where =
    and(
      iId.is(itemId),
      iColl.is(collective),
      mText.isNotNull,
      mText.isNot("")
    )

  val q = cte ++ selectDistinct(cols, from, where)
  for {
    _ <- logger.ftrace[ConnectionIO](
      s"query: $q (${itemId.id}, ${collective.id}, ${tagCategory})"
    )
    texts <- q.query[(String, Option[TagName])].to[List]
    _ <- logger.ftrace[ConnectionIO](
      s"Got ${texts.size} text and tag entries for item ${itemId.id}"
    )
    tag = texts.headOption.flatMap(_._2)
    // The join yields one row per (text, tag) combination. If the item
    // carries several tags of the category, every text would otherwise
    // be repeated once per tag, so de-duplicate (keeping first
    // occurrence order) before joining.
    txt = texts.map(_._1).distinct.mkString(" --n-- ")
  } yield TextAndTag(itemId, txt, tag)
}
}

View File

@ -61,6 +61,9 @@ object RClassifierSetting {
sql.update.run
}
/** Sets the `fileId` column to `fid` on the rows whose `cid` equals
  * `coll` and runs the update.
  *
  * @param coll the collective whose row is updated
  * @param fid  the new file id
  */
def updateFile(coll: Ident, fid: Ident): ConnectionIO[Int] = {
  val forCollective = cid.is(coll)
  val newFileId     = fileId.setTo(fid)
  updateRow(table, forCollective, newFileId).update.run
}
def updateSettings(v: RClassifierSetting): ConnectionIO[Int] =
for {
n1 <- updateRow(