Mirror of https://github.com/TheAnachronism/docspell.git (synced 2025-06-06 23:25:58 +00:00)

Commit 6a1297fc95 (parent 14a25fe23e): Add a limit for text analysis
@@ -0,0 +1,62 @@
+package docspell.analysis
+
+import cats.effect._
+import cats.implicits._
+import docspell.analysis.contact.Contact
+import docspell.analysis.date.DateFind
+import docspell.analysis.nlp.StanfordNerClassifier
+import docspell.common._
+
+trait TextAnalyser[F[_]] {
+
+  def annotate(logger: Logger[F], lang: Language, text: String): F[TextAnalyser.Result]
+
+}
+object TextAnalyser {
+
+  case class Result(labels: Vector[NerLabel], dates: Vector[NerDateLabel]) {
+
+    def all: Vector[NerLabel] =
+      labels ++ dates.map(dl => dl.label.copy(label = dl.date.toString))
+  }
+
+  def create[F[_]: Sync](cfg: TextAnalysisConfig): Resource[F, TextAnalyser[F]] =
+    Resource.pure[F, TextAnalyser[F]](new TextAnalyser[F] {
+      def annotate(
+          logger: Logger[F],
+          lang: Language,
+          text: String
+      ): F[TextAnalyser.Result] =
+        for {
+          input <- textLimit(logger, text)
+          tags0 <- stanfordNer(lang, input)
+          tags1 <- contactNer(input)
+          dates <- dateNer(lang, input)
+          list  = tags0 ++ tags1
+          spans = NerLabelSpan.build(list)
+        } yield Result(spans ++ list, dates)
+
+      private def textLimit(logger: Logger[F], text: String): F[String] =
+        if (text.length <= cfg.maxLength) text.pure[F]
+        else
+          logger.info(
+            s"The text to analyse is larger than the limit (${text.length} > ${cfg.maxLength})." +
+              s" Analysing only the first ${cfg.maxLength} characters."
+          ) *> text.take(cfg.maxLength).pure[F]
+
+      private def stanfordNer(lang: Language, text: String): F[Vector[NerLabel]] =
+        Sync[F].delay {
+          StanfordNerClassifier.nerAnnotate(lang)(text)
+        }
+
+      private def contactNer(text: String): F[Vector[NerLabel]] = Sync[F].delay {
+        Contact.annotate(text)
+      }
+
+      private def dateNer(lang: Language, text: String): F[Vector[NerDateLabel]] =
+        Sync[F].delay {
+          DateFind.findDates(text, lang).toVector
+        }
+    })
+
+}
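Worth noting in Result.all: found dates are folded back into the plain label vector, with each date label's text rewritten to the parsed date. A minimal sketch of that behaviour; the constructor shapes of NerLabel and NerDateLabel below are assumptions for illustration, not taken from this diff:

import java.time.LocalDate

// Assumed shapes (illustration only, not from this diff):
//   NerLabel(label: String, tag: NerTag, startPosition: Int, endPosition: Int)
//   NerDateLabel(date: LocalDate, label: NerLabel)
val found  = NerLabel("15.03.2020", NerTag.Date, 10, 20)
val dates  = Vector(NerDateLabel(LocalDate.of(2020, 3, 15), found))
val result = TextAnalyser.Result(Vector.empty, dates)

result.all
// => Vector(NerLabel("2020-03-15", NerTag.Date, 10, 20))
//    the raw date text is replaced by the normalized LocalDate.toString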
@@ -0,0 +1,5 @@
+package docspell.analysis
+
+case class TextAnalysisConfig(
+    maxLength: Int
+)
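Putting the two new pieces together, a minimal usage sketch. Assumptions beyond this diff: cats-effect IO, an available Logger[IO] named logger, and a Language value (the name Language.German is assumed):

import cats.effect.IO

val cfg = TextAnalysisConfig(maxLength = 10000)

val analysis: IO[TextAnalyser.Result] =
  TextAnalyser.create[IO](cfg).use { analyser =>
    analyser.annotate(logger, Language.German, "Rechnung vom 15.03.2020 ...")
  }
// The Result carries the NER labels and the found dates; any input longer
// than maxLength is truncated before the NER passes run.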
@@ -193,6 +193,18 @@ docspell.joex {
     }
   }
 
+  # Settings for text analysis
+  text-analysis {
+    # Maximum length of text to be analysed.
+    #
+    # All text to analyse must fit into RAM. A large document may take
+    # too much heap. Also, most important information is at the
+    # beginning of a document, so in most cases the first two pages
+    # should suffice. Default is 10000, which is about 2-3 pages
+    # (just a rough guess, of course).
+    max-length = 10000
+  }
+
   # Configuration for converting files into PDFs.
   #
   # Most of it is delegated to external tools, which can be configured
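To make the limit concrete, here is a standalone sketch of the truncation rule the comment describes (the same check textLimit performs above):

val maxLength = 10000
val text      = "x" * 25000 // e.g. a long contract

val input = if (text.length <= maxLength) text else text.take(maxLength)
// input.length == 10000: only the first 10000 characters reach the NER stage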
@@ -1,5 +1,6 @@
 package docspell.joex
 
+import docspell.analysis.TextAnalysisConfig
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.{PeriodicSchedulerConfig, SchedulerConfig}
 import docspell.store.JdbcConfig
@@ -16,6 +17,7 @@ case class Config(
     periodicScheduler: PeriodicSchedulerConfig,
     houseKeeping: HouseKeepingConfig,
     extraction: ExtractConfig,
+    textAnalysis: TextAnalysisConfig,
     convert: ConvertConfig
 )
 
@@ -2,6 +2,7 @@ package docspell.joex.process
 
 import cats.effect._
 import docspell.common.ProcessItemArgs
+import docspell.analysis.TextAnalysisConfig
 import docspell.joex.scheduler.Task
 import docspell.joex.Config
 
@@ -14,13 +15,15 @@ object ProcessItem {
       .flatMap(ConvertPdf(cfg.convert, _))
       .flatMap(TextExtraction(cfg.extraction, _))
       .flatMap(Task.setProgress(50))
-      .flatMap(analysisOnly[F])
+      .flatMap(analysisOnly[F](cfg.textAnalysis))
       .flatMap(Task.setProgress(75))
       .flatMap(LinkProposal[F])
       .flatMap(Task.setProgress(99))
 
-  def analysisOnly[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
-    TextAnalysis[F](item)
+  def analysisOnly[F[_]: Sync](
+      cfg: TextAnalysisConfig
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+    TextAnalysis[F](cfg)(item)
       .flatMap(FindProposal[F])
       .flatMap(EvalProposals[F])
       .flatMap(SaveProposals[F])
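The reworked analysisOnly is curried: applying the config first leaves a function of ItemData, which drops straight into the flatMap chain as .flatMap(analysisOnly[F](cfg.textAnalysis)). A schematic sketch of that shape, with stand-in types (not docspell's):

object CurrySketch {
  // Stand-in: Option plays the role of Task[F, ProcessItemArgs, *] here.
  def analysisOnly(maxLength: Int)(item: String): Option[String] =
    Some(item.take(maxLength))

  // Applying the config yields exactly the item => task shape flatMap needs:
  val step: String => Option[String] = analysisOnly(10000)

  val result: Option[String] = Some("extracted text").flatMap(step)
}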
@@ -2,9 +2,7 @@ package docspell.joex.process
 
 import cats.implicits._
 import cats.effect.Sync
-import docspell.analysis.nlp._
-import docspell.analysis.contact._
-import docspell.analysis.date._
+import docspell.analysis.{TextAnalyser, TextAnalysisConfig}
 import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
@@ -12,50 +10,34 @@ import docspell.store.records.RAttachmentMeta
 
 object TextAnalysis {
 
-  def apply[F[_]: Sync](item: ItemData): Task[F, ProcessItemArgs, ItemData] =
+  def apply[F[_]: Sync](
+      cfg: TextAnalysisConfig
+  )(item: ItemData): Task[F, ProcessItemArgs, ItemData] =
     Task { ctx =>
-      for {
-        _ <- ctx.logger.info("Starting text analysis")
-        s <- Duration.stopTime[F]
-        t <- item.metas.toList.traverse(annotateAttachment[F](ctx.args.meta.language))
-        _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
-        _ <- t.traverse(m =>
-          ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
-        )
-        e <- s
-        _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
-        v = t.toVector
-      } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+      TextAnalyser.create[F](cfg).use { analyser =>
+        for {
+          _ <- ctx.logger.info("Starting text analysis")
+          s <- Duration.stopTime[F]
+          t <- item.metas.toList
+            .traverse(annotateAttachment[F](ctx.args.meta.language, ctx.logger, analyser))
+          _ <- ctx.logger.debug(s"Storing tags: ${t.map(_._1.copy(content = None))}")
+          _ <- t.traverse(m =>
+            ctx.store.transact(RAttachmentMeta.updateLabels(m._1.id, m._1.nerlabels))
+          )
+          e <- s
+          _ <- ctx.logger.info(s"Text-Analysis finished in ${e.formatExact}")
+          v = t.toVector
+        } yield item.copy(metas = v.map(_._1), dateLabels = v.map(_._2))
+      }
     }
 
   def annotateAttachment[F[_]: Sync](
-      lang: Language
+      lang: Language,
+      logger: Logger[F],
+      analyser: TextAnalyser[F]
   )(rm: RAttachmentMeta): F[(RAttachmentMeta, AttachmentDates)] =
     for {
-      list0 <- stanfordNer[F](lang, rm)
-      list1 <- contactNer[F](rm)
-      list = list0 ++ list1
-      spans = NerLabelSpan.build(list.toSeq)
-      dates <- dateNer[F](rm, lang)
-    } yield (rm.copy(nerlabels = (spans ++ list ++ dates.toNerLabel).toList), dates)
-
-  def stanfordNer[F[_]: Sync](lang: Language, rm: RAttachmentMeta): F[Vector[NerLabel]] =
-    Sync[F].delay {
-      rm.content.map(StanfordNerClassifier.nerAnnotate(lang)).getOrElse(Vector.empty)
-    }
-
-  def contactNer[F[_]: Sync](rm: RAttachmentMeta): F[Vector[NerLabel]] = Sync[F].delay {
-    rm.content.map(Contact.annotate).getOrElse(Vector.empty)
-  }
-
-  def dateNer[F[_]: Sync](rm: RAttachmentMeta, lang: Language): F[AttachmentDates] =
-    Sync[F].delay {
-      AttachmentDates(
-        rm,
-        rm.content
-          .map(txt => DateFind.findDates(txt, lang).toVector)
-          .getOrElse(Vector.empty)
-      )
-    }
+      labels <- analyser.annotate(logger, lang, rm.content.getOrElse(""))
+    } yield (rm.copy(nerlabels = labels.all.toList), AttachmentDates(rm, labels.dates))
 
 }
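A design note on the rewritten apply: TextAnalyser.create(cfg).use { ... } acquires the analyser once per task, and the traverse reuses that single instance for every attachment, rather than re-running separate stanford/contact/date steps per attachment. A stripped-down sketch of the acquire-once, reuse-many pattern (plain cats-effect, nothing docspell-specific):

import cats.effect.{IO, Resource}
import cats.implicits._

// The "analyser" (here just a function) is acquired once ...
def analyseAll(attachments: List[String]): IO[List[Int]] =
  Resource.pure[IO, String => Int](_.length).use { analyser =>
    // ... and shared across all attachments via traverse.
    attachments.traverse(a => IO(analyser(a)))
  }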
@@ -78,6 +78,9 @@ let
       };
     };
   };
+  text-analysis = {
+    max-length = 10000;
+  };
   convert = {
     chunk-size = 524288;
     max-image-size = 14000000;
@@ -530,6 +533,29 @@ in {
       '';
     };
 
+    text-analysis = mkOption {
+      type = types.submodule({
+        options = {
+          max-length = mkOption {
+            type = types.int;
+            default = defaults.text-analysis.max-length;
+            description = ''
+              Maximum length of text to be analysed.
+
+              All text to analyse must fit into RAM. A large document may take
+              too much heap. Also, most important information is at the
+              beginning of a document, so in most cases the first two pages
+              should suffice. Default is 10000, which is about 2-3 pages
+              (a rough guess).
+            '';
+          };
+
+        };
+      });
+      default = defaults.text-analysis;
+      description = "Settings for text analysis";
+    };
+
     convert = mkOption {
       type = types.submodule({
         options = {