Merge pull request #56 from eikek/feature/analysis

Feature/analysis
This commit is contained in:
eikek 2020-03-18 00:00:47 +01:00 committed by GitHub
commit 2a7066650f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 295 additions and 45 deletions

View File

@ -40,6 +40,13 @@ val testSettings = Seq(
libraryDependencies ++= Dependencies.miniTest ++ Dependencies.logging.map(_ % Test)
)
// Settings that switch publishing off for a project: the `publish` and
// `publishLocal` tasks are replaced by empty bodies and `publishArtifact`
// is set to false.
lazy val noPublish = Seq(
publish := {},
publishLocal := {},
publishArtifact := false
)
val elmSettings = Seq(
elmCompileMode := ElmCompileMode.Debug,
Compile/resourceGenerators += Def.task {
@ -424,6 +431,7 @@ val microsite = project.in(file("modules/microsite")).
val root = project.in(file(".")).
settings(sharedSettings).
settings(noPublish).
settings(
name := "docspell-root"
).

View File

@ -0,0 +1,56 @@
package docspell.common
/** A run of directly adjacent NER labels that can be collapsed into a
  * single, multi-word label.
  *
  * All labels of a span share the same tag; `asLabel` relies on this when
  * it uses the tag of the first label for the merged result.
  */
final case class NerLabelSpan private (
    labels: Vector[NerLabel]
) {

  /** Number of labels collected in this span. */
  def size: Int = labels.size

  /** Tries to extend this span with `label`.
    *
    * An empty span accepts any label. A non-empty span accepts `label` only
    * if it starts exactly one position after the end of the last label and
    * carries the same tag as that last label. Without the tag check a span
    * could mix tags and `asLabel` would report the merged text under the
    * tag of the first label only.
    *
    * @return a new span including `label`, or this very instance (same
    *         reference) if `label` was not accepted. Callers may use
    *         reference equality to detect the rejection.
    */
  def +(label: NerLabel): NerLabelSpan =
    labels.lastOption match {
      case None =>
        NerLabelSpan(Vector(label))
      case Some(el) =>
        val adjacent = label.startPosition - el.endPosition == 1
        val sameTag  = label.tag == el.tag
        if (adjacent && sameTag) NerLabelSpan(labels :+ label)
        else this
    }

  /** Collapses the span into one label.
    *
    * The label texts are joined with a single space, the tag and start
    * position are taken from the first label and the end position from the
    * last one.
    *
    * @return the merged label, or `None` if the span is empty
    */
  def asLabel: Option[NerLabel] =
    (labels.headOption, labels.lastOption) match {
      case (Some(s), Some(e)) =>
        Some(
          NerLabel(
            labels.map(_.label).mkString(" "),
            s.tag,
            s.startPosition,
            e.endPosition
          )
        )
      case _ =>
        None
    }
}
object NerLabelSpan {

  /** A span that contains no labels. */
  val empty = NerLabelSpan(Vector.empty)

  /** Groups labels into spans.
    *
    * The labels are ordered by start position and then folded from left to
    * right: each label is offered to the most recent span and, if that span
    * hands back the very same instance (the label was not appended), a new
    * span is started with the label. Spans holding a single label are
    * dropped from the result.
    */
  def buildSpans(labels: Seq[NerLabel]): Vector[NerLabelSpan] = {
    val ordered = labels.sortBy(_.startPosition)
    val grouped = ordered.foldLeft(Vector.empty[NerLabelSpan]) { (acc, label) =>
      acc.lastOption.map(open => (open, open + label)) match {
        case Some((open, extended)) if extended ne open =>
          // the open span took the label: swap it for the extended one
          acc.init :+ extended
        case _ =>
          // nothing collected yet, or the label was rejected: open a new span
          acc :+ (empty + label)
      }
    }
    grouped.filter(_.size > 1)
  }

  /** Builds the spans for `labels` and converts each one into a merged label. */
  def build(labels: Seq[NerLabel]): Vector[NerLabel] =
    for {
      span   <- buildSpans(labels)
      merged <- span.asLabel.toVector
    } yield merged
}

View File

@ -2,6 +2,8 @@ package docspell.common
import java.util.concurrent.atomic.AtomicLong
import java.util.concurrent.{Executors, ThreadFactory}
import cats.effect._
import scala.concurrent._
object ThreadFactories {
@ -17,4 +19,16 @@ object ThreadFactories {
}
}
/** Wraps an executor service in a `Resource`.
  *
  * The by-name argument `c` is evaluated inside `Sync[F].delay` when the
  * resource is acquired; on release `shutdown` is called on the executor.
  */
def executorResource[F[_]: Sync](
    c: => ExecutionContextExecutorService
): Resource[F, ExecutionContextExecutorService] = {
  val acquire = Sync[F].delay(c)
  Resource.make(acquire)(pool => Sync[F].delay(pool.shutdown()))
}
/** A resource holding an execution context that is backed by
  * `Executors.newCachedThreadPool` using the given thread factory.
  */
def cached[F[_]: Sync](tf: ThreadFactory): Resource[F, ExecutionContextExecutorService] =
  executorResource {
    val pool = Executors.newCachedThreadPool(tf)
    ExecutionContext.fromExecutorService(pool)
  }
/** A resource holding an execution context that is backed by
  * `Executors.newFixedThreadPool` with `n` threads and the given thread
  * factory.
  */
def fixed[F[_]: Sync](n: Int, tf: ThreadFactory): Resource[F, ExecutionContextExecutorService] =
  executorResource {
    val pool = Executors.newFixedThreadPool(n, tf)
    ExecutionContext.fromExecutorService(pool)
  }
}

View File

@ -0,0 +1,45 @@
package docspell.common
import minitest._
object NerLabelSpanTest extends SimpleTestSuite {

  // Single-word entity labels; neighbouring words of one name are exactly
  // one position apart.
  private val entityLabels = List(
    NerLabel("Derek", NerTag.Person, 0, 5),
    NerLabel("Jeter", NerTag.Person, 6, 11),
    NerLabel("Treesville", NerTag.Person, 27, 37),
    NerLabel("Derek", NerTag.Person, 68, 73),
    NerLabel("Jeter", NerTag.Person, 74, 79),
    NerLabel("Treesville", NerTag.Location, 95, 105),
    NerLabel("Syrup", NerTag.Organization, 162, 167),
    NerLabel("Production", NerTag.Organization, 168, 178),
    NerLabel("Old", NerTag.Organization, 179, 182),
    NerLabel("Sticky", NerTag.Organization, 183, 189),
    NerLabel("Pancake", NerTag.Organization, 190, 197),
    NerLabel("Company", NerTag.Organization, 198, 205),
    NerLabel("Maple", NerTag.Location, 210, 215),
    NerLabel("Lane", NerTag.Location, 216, 220),
    NerLabel("Forest", NerTag.Location, 222, 238),
    NerLabel("Hemptown", NerTag.Location, 243, 251),
    NerLabel("Little", NerTag.Organization, 351, 357),
    NerLabel("League", NerTag.Organization, 358, 364),
    NerLabel("Derek", NerTag.Person, 1121, 1126),
    NerLabel("Jeter", NerTag.Person, 1127, 1132)
  )

  // Date labels, listed after the entities and therefore out of position
  // order in the combined input.
  private val dateLabels = List(
    NerLabel("2016-11-07", NerTag.Date, 50, 60),
    NerLabel("2016-11-07", NerTag.Date, 119, 129),
    NerLabel("2019-09-03", NerTag.Date, 253, 264),
    NerLabel("2016-12-12", NerTag.Date, 1080, 1091)
  )

  test("build") {
    val expected = Vector(
      NerLabel("Derek Jeter", NerTag.Person, 0, 11),
      NerLabel("Derek Jeter", NerTag.Person, 68, 79),
      NerLabel("Syrup Production Old Sticky Pancake Company", NerTag.Organization, 162, 205),
      NerLabel("Maple Lane", NerTag.Location, 210, 220),
      NerLabel("Little League", NerTag.Organization, 351, 364),
      NerLabel("Derek Jeter", NerTag.Person, 1121, 1132)
    )

    val merged = NerLabelSpan.build(entityLabels ++ dateLabels)
    assertEquals(merged, expected)
  }
}

View File

@ -3,9 +3,8 @@ package docspell.joex
import cats.effect.{Blocker, ExitCode, IO, IOApp}
import cats.implicits._
import scala.concurrent.ExecutionContext
import java.util.concurrent.Executors
import java.nio.file.{Files, Paths}
import scala.concurrent.ExecutionContext
import docspell.common.{Banner, ThreadFactories}
import org.log4s._
@ -13,13 +12,8 @@ import org.log4s._
object Main extends IOApp {
private[this] val logger = getLogger
val blockingEC: ExecutionContext = ExecutionContext.fromExecutor(
Executors.newCachedThreadPool(ThreadFactories.ofName("docspell-joex-blocking"))
)
val blocker = Blocker.liftExecutionContext(blockingEC)
val connectEC: ExecutionContext = ExecutionContext.fromExecutorService(
Executors.newFixedThreadPool(5, ThreadFactories.ofName("docspell-joex-dbconnect"))
)
val blockingEC = ThreadFactories.cached[IO](ThreadFactories.ofName("docspell-joex-blocking"))
val connectEC = ThreadFactories.fixed[IO](5, ThreadFactories.ofName("docspell-joex-dbconnect"))
def run(args: List[String]) = {
args match {
@ -52,6 +46,15 @@ object Main extends IOApp {
cfg.baseUrl
)
logger.info(s"\n${banner.render("***>")}")
JoexServer.stream[IO](cfg, connectEC, blockingEC, blocker).compile.drain.as(ExitCode.Success)
val pools = for {
cec <- connectEC
bec <- blockingEC
blocker = Blocker.liftExecutorService(bec)
} yield Pools(cec, bec, blocker)
pools.use(p =>
JoexServer.stream[IO](cfg, p.connectEC, p.clientEC, p.blocker).compile.drain.as(ExitCode.Success)
)
}
case class Pools(connectEC: ExecutionContext, clientEC: ExecutionContext, blocker: Blocker)
}

View File

@ -0,0 +1,66 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
/** Re-sorts the candidates of every proposal by a computed weight so that
  * the most probable fits come first.
  */
object EvalProposals {

  /** Task that returns `data` with the candidates of each attachment meta
    * reordered via `reorderCandidates`. The task context is not used.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { _ =>
      val reordered = data.metas.map(reorderCandidates)
      data.copy(metas = reordered).pure[F]
    }

  /** Rebuilds the proposal list of `rm`: for every proposal type that has a
    * proposal, its candidate values are sorted by `weight` (with an
    * ascending sort, smaller weights come first).
    */
  def reorderCandidates(rm: RAttachmentMeta): RAttachmentMeta = {
    val sortedProposals =
      for {
        mpt <- rm.proposals.getTypes.toList
        mp  <- rm.proposals.find(mpt).toList
      } yield mp.copy(values = mp.values.sortBy(weight(rm, mp)))

    rm.copy(proposals = MetaProposalList(sortedProposals))
  }

  /** Weight of one candidate; the product of four factors:
    *  - `1 / (largest word count among the origin labels)`
    *  - `1 / (number of origin labels)`
    *  - `positionWeight` of the smallest start position within the content
    *  - the smallest `nerTagFactor` of the origin labels
    *
    * NOTE(review): `min`/`max` throw on an empty collection, so this fails
    * if `cand.origin` is empty; presumably every candidate carries at least
    * one origin label, confirm against the code creating candidates.
    */
  def weight(rm: RAttachmentMeta, mp: MetaProposal)(cand: MetaProposal.Candidate): Double = {
    val origins       = cand.origin
    val contentLength = rm.content.map(_.length).getOrElse(0)
    val originFactor  = 1 / origins.size.toDouble
    val firstPos      = origins.map(_.startPosition).min
    val wordFactor    = 1 / origins.map(_.label.split(' ').length).max.toDouble
    val tagFactor     = origins.map(l => nerTagFactor(l.tag, mp.proposalType)).min

    wordFactor * originFactor * positionWeight(firstPos, contentLength) * tagFactor
  }

  /** Maps a position inside a text of length `total` to a weight.
    *
    * Without a positive `total` the weight is 1. Otherwise it is the
    * absolute ratio `pos / total`, halved when the ratio is below 0.7.
    */
  def positionWeight(pos: Int, total: Int): Double =
    total match {
      case t if t <= 0 =>
        1
      case t =>
        val ratio = math.abs(pos.toDouble / t.toDouble)
        if (ratio >= 0.7) ratio
        else ratio / 2
    }

  /** Factor for a label's tag in the context of the proposal type `mt`. */
  def nerTagFactor(tag: NerTag, mt: MetaProposalType): Double =
    tag match {
      // tag that matches the kind of proposal being looked for
      case NerTag.Organization if mt == MetaProposalType.CorrOrg =>
        0.8
      case NerTag.Person
          if mt == MetaProposalType.CorrPerson || mt == MetaProposalType.ConcPerson =>
        0.8
      case NerTag.Email | NerTag.Website =>
        0.5
      case NerTag.Date | NerTag.Location | NerTag.Misc | NerTag.Organization | NerTag.Person =>
        1.0
    }
}

View File

@ -24,16 +24,10 @@ object FindProposal {
ctx.logger.info("Starting find-proposal") *>
rmas
.traverse(rm =>
processAttachment(rm, data.findDates(rm), ctx).map(ml => rm.copy(proposals = ml))
)
.flatMap(rmv =>
rmv
.traverse(rm =>
ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))
)
.map(_ => data.copy(metas = rmv))
processAttachment(rm, data.findDates(rm), ctx)
.map(ml => rm.copy(proposals = ml))
)
.map(rmv => data.copy(metas = rmv))
}
def processAttachment[F[_]: Sync](
@ -56,13 +50,19 @@ object FindProposal {
val dueDates = MetaProposalList.fromSeq1(
MetaProposalType.DueDate,
after.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
val itemDates = MetaProposalList.fromSeq1(
MetaProposalType.DocDate,
before.map(ndl =>
Candidate(IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString), Set(ndl.label))
Candidate(
IdRef(Ident.unsafe(ndl.date.toString), ndl.date.toString),
Set(ndl.label)
)
)
)
@ -71,13 +71,13 @@ object FindProposal {
/** Removes labels that repeat an earlier one.
  *
  * Two labels count as equal when their tag name and lower-cased label
  * text match. Labels are visited in order of start position and the first
  * occurrence is kept; the result is ordered by start position again.
  */
def removeDuplicates(labels: List[NerLabel]): List[NerLabel] = {
  val ordered = labels.sortBy(_.startPosition)
  val (_, kept) =
    ordered.foldLeft((Set.empty[String], List.empty[NerLabel])) {
      case (acc @ (seen, result), el) =>
        val key = el.tag.name + el.label.toLowerCase
        if (seen(key)) acc
        else (seen + key, el :: result)
    }
  // `kept` was built by prepending, so it is sorted once more
  kept.sortBy(_.startPosition)
}
trait Finder[F[_]] { self =>
def find(labels: Seq[NerLabel]): F[MetaProposalList]
@ -91,7 +91,9 @@ object FindProposal {
def flatMap(f: MetaProposalList => Finder[F])(implicit F: FlatMap[F]): Finder[F] =
labels => self.find(labels).flatMap(ml => f(ml).find(labels))
def map(f: MetaProposalList => MetaProposalList)(implicit F: Applicative[F]): Finder[F] =
def map(
f: MetaProposalList => MetaProposalList
)(implicit F: Applicative[F]): Finder[F] =
labels => self.find(labels).map(f)
def next(f: Finder[F])(implicit F: FlatMap[F], F3: Applicative[F]): Finder[F] =
@ -118,10 +120,12 @@ object FindProposal {
_ => value.pure[F]
def searchExact[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, true, ctx)).map(MetaProposalList.flatten)
def searchFuzzy[F[_]: Sync](ctx: Context[F, ProcessItemArgs]): Finder[F] =
labels => labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
labels =>
labels.toList.traverse(nl => search(nl, false, ctx)).map(MetaProposalList.flatten)
}
private def search[F[_]: Sync](
@ -154,10 +158,15 @@ object FindProposal {
val s2 = ctx.store
.transact(RPerson.findLike(ctx.args.meta.collective, value, false))
.map(MetaProposalList.from(MetaProposalType.CorrPerson, nt))
ctx.logger.debug(s"Looking for persons: $value") *> (for {
val s3 =
ctx.store
.transact(ROrganization.findLike(ctx.args.meta.collective, value))
.map(MetaProposalList.from(MetaProposalType.CorrOrg, nt))
ctx.logger.debug(s"Looking for persons and organizations: $value") *> (for {
ml0 <- s1
ml1 <- s2
} yield ml0 |+| ml1)
ml2 <- s3
} yield ml0 |+| ml1 |+| ml2)
case NerTag.Location =>
ctx.logger

View File

@ -16,7 +16,16 @@ object ProcessItem {
.flatMap(TextAnalysis[F])
.flatMap(Task.setProgress(50))
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
.flatMap(Task.setProgress(75))
.flatMap(LinkProposal[F])
.flatMap(Task.setProgress(99))
/** Runs only the analysis steps for `item`: `TextAnalysis` first, then
  * `FindProposal`, `EvalProposals` and `SaveProposals`, each chained onto
  * the previous one with `flatMap`. It does not call `Task.setProgress`.
  */
def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
TextAnalysis[F](item)
.flatMap(FindProposal[F])
.flatMap(EvalProposals[F])
.flatMap(SaveProposals[F])
}

View File

@ -0,0 +1,24 @@
package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common._
import docspell.joex.scheduler.Task
import docspell.store.records._
/** Writes the proposals of all attachment metas to the database. */
object SaveProposals {

  /** Task that logs and stores the proposals of every entry in
    * `data.metas` through `RAttachmentMeta.updateProposals` and then
    * returns `data` unchanged.
    */
  def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] =
    Task { ctx =>
      def persist(rm: RAttachmentMeta) =
        ctx.logger.debug(s"Storing attachment proposals: ${rm.proposals}") *>
          ctx.store.transact(RAttachmentMeta.updateProposals(rm.id, rm.proposals))

      for {
        _ <- ctx.logger.info("Storing proposals")
        _ <- data.metas.traverse(persist)
      } yield data
    }
}

View File

@ -34,8 +34,10 @@ object TextAnalysis {
for {
list0 <- stanfordNer[F](lang, rm)
list1 <- contactNer[F](rm)
list = list0 ++ list1
spans = NerLabelSpan.build(list.toSeq)
dates <- dateNer[F](rm, lang)
} yield (rm.copy(nerlabels = (list0 ++ list1 ++ dates.toNerLabel).toList), dates)
} yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
Sync[F].delay {

View File

@ -33,14 +33,14 @@ object Context {
private[this] val log = getLogger
def create[F[_]: Functor, A](
job: RJob,
jobId: Ident,
arg: A,
config: SchedulerConfig,
log: Logger[F],
store: Store[F],
blocker: Blocker
): Context[F, A] =
new ContextImpl(arg, log, store, blocker, config, job.id)
new ContextImpl(arg, log, store, blocker, config, jobId)
def apply[F[_]: Concurrent, A](
job: RJob,
@ -54,7 +54,7 @@ object Context {
_ <- log.ftrace("Creating logger for task run")
logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink)
_ <- log.ftrace("Logger created, instantiating context")
ctx = create[F, A](job, arg, config, logger, store, blocker)
ctx = create[F, A](job.id, arg, config, logger, store, blocker)
} yield ctx
final private class ContextImpl[F[_]: Functor, A](

View File

@ -4,7 +4,6 @@ import cats.effect._
import cats.implicits._
import scala.concurrent.ExecutionContext
import java.util.concurrent.Executors
import java.nio.file.{Files, Paths}
import docspell.common.{Banner, ThreadFactories}
@ -13,14 +12,8 @@ import org.log4s._
object Main extends IOApp {
private[this] val logger = getLogger
val blockingEc: ExecutionContext = ExecutionContext.fromExecutor(
Executors.newCachedThreadPool(ThreadFactories.ofName("docspell-restserver-blocking"))
)
val blocker = Blocker.liftExecutionContext(blockingEc)
val connectEC: ExecutionContext = ExecutionContext.fromExecutorService(
Executors.newFixedThreadPool(5, ThreadFactories.ofName("docspell-dbconnect"))
)
val blockingEC = ThreadFactories.cached[IO](ThreadFactories.ofName("docspell-restserver-blocking"))
val connectEC = ThreadFactories.fixed[IO](5, ThreadFactories.ofName("docspell-dbconnect"))
def run(args: List[String]) = {
args match {
@ -52,7 +45,17 @@ object Main extends IOApp {
cfg.appId,
cfg.baseUrl
)
val pools = for {
cec <- connectEC
bec <- blockingEC
blocker = Blocker.liftExecutorService(bec)
} yield Pools(cec, bec, blocker)
logger.info(s"\n${banner.render("***>")}")
RestServer.stream[IO](cfg, connectEC, blockingEc, blocker).compile.drain.as(ExitCode.Success)
pools.use(p =>
RestServer.stream[IO](cfg, p.connectEC, p.clientEC, p.blocker).compile.drain.as(ExitCode.Success)
)
}
case class Pools(connectEC: ExecutionContext, clientEC: ExecutionContext, blocker: Blocker)
}

View File

@ -24,7 +24,7 @@ object ONode {
def register(appId: Ident, nodeType: NodeType, uri: LenientUri): F[Unit] =
for {
node <- RNode(appId, nodeType, uri)
_ <- logger.finfo(s"Registering node $node")
_ <- logger.finfo(s"Registering node ${node.id.id}")
_ <- store.transact(RNode.set(node))
} yield ()

View File

@ -336,7 +336,7 @@ update key flags next msg model =
( im, ic ) =
Comp.ItemMail.init flags
in
( { model | itemDatePicker = dp, dueDatePicker = dp, itemMail = im }
( { model | itemDatePicker = dp, dueDatePicker = dp, itemMail = im, visibleAttach = 0 }
, Cmd.batch
[ getOptions flags
, Cmd.map ItemDatePickerMsg dpc
@ -435,6 +435,7 @@ update key flags next msg model =
, notesField = ViewNotes
, itemDate = item.itemDate
, dueDate = item.dueDate
, visibleAttach = 0
}
, Cmd.batch
[ c1
@ -1127,6 +1128,15 @@ renderNotes model =
]
{-| Tells whether the attachment at index `pos` is the one to show.

If `model.visibleAttach` is not smaller than the number of attachments of
the item, the attachment at index 0 is shown instead.

-}
attachmentVisible : Model -> Int -> Bool
attachmentVisible model pos =
    let
        attachCount =
            List.length model.item.attachments

        shownIndex =
            if model.visibleAttach >= attachCount then
                0

            else
                model.visibleAttach
    in
    pos == shownIndex
renderAttachmentsTabMenu : Model -> Html Msg
renderAttachmentsTabMenu model =
let
@ -1153,9 +1163,10 @@ renderAttachmentsTabMenu model =
a
[ classList
[ ( "item", True )
, ( "active", pos == model.visibleAttach )
, ( "active", attachmentVisible model pos )
]
, title (Maybe.withDefault "No Name" el.name)
, href ""
, onClick (SetActiveAttachment pos)
]
[ Maybe.map (Util.String.ellipsis 20) el.name
@ -1180,7 +1191,7 @@ renderAttachmentView model pos attach =
div
[ classList
[ ( "ui attached tab segment", True )
, ( "active", pos == model.visibleAttach )
, ( "active", attachmentVisible model pos )
]
]
[ div [ class "ui small secondary menu" ]