Reorganize processing code

Use separate modules for

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner 2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -145,14 +145,14 @@ val common = project.in(file("modules/common")).
// Some example files for testing
// https://file-examples.com/index.php/sample-documents-download/sample-doc-download/
val exampleFiles = project.in(file("modules/files")).
val files = project.in(file("modules/files")).
disablePlugins(RevolverPlugin).
settings(sharedSettings).
settings(testSettings).
settings(
name := "docspell-files",
libraryDependencies ++=
Dependencies.tika,
Dependencies.tika ,
Test / sourceGenerators += Def.task {
val base = (Test/resourceDirectory).value
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
@ -196,18 +196,17 @@ val store = project.in(file("modules/store")).
Dependencies.emil
).dependsOn(common)
val text = project.in(file("modules/text")).
val extract = project.in(file("modules/extract")).
disablePlugins(RevolverPlugin).
enablePlugins(NerModelsPlugin).
settings(sharedSettings).
settings(testSettings).
settings(NerModelsPlugin.nerClassifierSettings).
settings(
name := "docspell-text",
name := "docspell-extract",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.stanfordNlpCore
).dependsOn(common, exampleFiles % "compile->compile;test->test")
Dependencies.pdfbox ++
Dependencies.poi
).dependsOn(common, files % "compile->compile;test->test")
val convert = project.in(file("modules/convert")).
disablePlugins(RevolverPlugin).
@ -216,9 +215,21 @@ val convert = project.in(file("modules/convert")).
settings(
name := "docspell-convert",
libraryDependencies ++=
Dependencies.pdfbox ++
Dependencies.flexmark
).dependsOn(common, exampleFiles % "compile->compile;test->test")
).dependsOn(common, files % "compile->compile;test->test")
val analysis = project.in(file("modules/analysis")).
disablePlugins(RevolverPlugin).
enablePlugins(NerModelsPlugin).
settings(sharedSettings).
settings(testSettings).
settings(NerModelsPlugin.nerClassifierSettings).
settings(
name := "docspell-analysis",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.stanfordNlpCore
).dependsOn(common, files % "test->test")
val restapi = project.in(file("modules/restapi")).
disablePlugins(RevolverPlugin).
@ -272,7 +283,7 @@ val joex = project.in(file("modules/joex")).
addCompilerPlugin(Dependencies.betterMonadicFor),
buildInfoPackage := "docspell.joex",
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
).dependsOn(store, text, convert, joexapi, restapi)
).dependsOn(store, extract, convert, analysis, joexapi, restapi)
val backend = project.in(file("modules/backend")).
disablePlugins(RevolverPlugin).
@ -402,9 +413,10 @@ val root = project.in(file(".")).
name := "docspell-root"
).
aggregate(common
, text
, extract
, convert
, exampleFiles
, analysis
, files
, store
, joexapi
, joex

View File

@ -1,9 +1,10 @@
package docspell.text.contact
package docspell.analysis.contact
import fs2.Stream
import cats.implicits._
import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
import docspell.text.split.TextSplitter
import docspell.common._
import docspell.analysis.split._
object Contact {
private[this] val protocols = Set("ftp", "http", "https")

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri

View File

@ -1,6 +1,6 @@
package docspell.text.contact
package docspell.analysis.contact
private[text] object Tld {
private[analysis] object Tld {
def findTld(str: String): Option[String] =
known.find(str.endsWith)

View File

@ -1,10 +1,10 @@
package docspell.text.date
package docspell.analysis.date
import fs2._
import java.time.LocalDate
import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
import docspell.text.split.{TextSplitter, Word}
import fs2.{Pure, Stream}
import docspell.common._
import docspell.analysis.split._
import scala.util.Try
@ -21,7 +21,7 @@ object DateFind {
.map(sd =>
NerDateLabel(
sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
)
)
)

View File

@ -1,17 +1,17 @@
package docspell.text.nlp
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import docspell.common.{Language, NerLabel, NerTag}
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import scala.jdk.CollectionConverters._
import org.log4s._
import docspell.common._
import java.net.URL
import scala.util.Using
import scala.jdk.CollectionConverters._
object StanfordNerClassifier {
private[this] val logger = getLogger

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
import fs2.Stream

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty

View File

@ -0,0 +1,21 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
  // Blocker backed by the global execution context; fine for test-only IO.
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  // German sample letter, read once on first access and cached thereafter.
  lazy val letterDEText =
    ExampleFiles.letter_de_txt.readText[IO](16384, blocker).unsafeRunSync

  // English sample letter, read once on first access and cached thereafter.
  lazy val letterENText =
    ExampleFiles.letter_en_txt.readText[IO](16384, blocker).unsafeRunSync
}

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite

View File

@ -1,8 +1,8 @@
package docspell.text.date
package docspell.analysis.date
import docspell.analysis.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language
import docspell.text.TestFiles
import minitest._
object DateFindSpec extends SimpleTestSuite {

View File

@ -1,8 +1,8 @@
package docspell.text.nlp
package docspell.analysis.nlp
import docspell.common.{Language, NerLabel, NerTag}
import docspell.text.TestFiles
import minitest.SimpleTestSuite
import docspell.analysis.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {

View File

@ -1,6 +1,6 @@
package docspell.text.split
package docspell.analysis.split
import minitest._
import minitest.SimpleTestSuite
object TestSplitterSpec extends SimpleTestSuite {

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.{Path, Paths}

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.Path
@ -61,7 +61,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
cfg: Config,
wd: Path,
@ -92,7 +92,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path,
ghostscript: SystemCommand.Config,
wd: Path,
@ -120,7 +120,7 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path,
unpaper: SystemCommand.Config,
wd: Path,
@ -146,7 +146,7 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
@ -164,7 +164,7 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._

View File

@ -1,4 +1,4 @@
package docspell.text
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}

View File

@ -1,9 +1,9 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.text.TestFiles
import docspell.extract.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {

View File

@ -3,7 +3,7 @@ package docspell.joex
import docspell.common.{Ident, LenientUri}
import docspell.joex.scheduler.SchedulerConfig
import docspell.store.JdbcConfig
import docspell.text.ocr.{Config => OcrConfig}
import docspell.extract.ocr.{Config => OcrConfig}
import docspell.convert.ConvertConfig
case class Config(

View File

@ -5,11 +5,11 @@ import java.time.ZoneId
import cats.{Applicative, FlatMap}
import cats.implicits._
import cats.effect.Sync
import docspell.analysis.contact._
import docspell.common.MetaProposal.Candidate
import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
import docspell.text.contact.Domain
import docspell.store.records._
/** Super simple approach to find corresponding meta data to an item
* by looking up values from NER in the users address book.

View File

@ -2,13 +2,13 @@ package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
import docspell.analysis.nlp._
import docspell.analysis.contact._
import docspell.analysis.date._
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
import docspell.text.contact.Contact
import docspell.text.date.DateFind
import docspell.text.nlp.StanfordNerClassifier
object TextAnalysis {

View File

@ -7,7 +7,7 @@ import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.Store
import docspell.store.records.{RAttachment, RAttachmentMeta}
import docspell.text.ocr.{TextExtract, Config => OcrConfig}
import docspell.extract.ocr.{TextExtract, Config => OcrConfig}
object TextExtraction {

View File

@ -25,6 +25,7 @@ object Dependencies {
val PoiVersion = "4.1.1"
val PostgresVersion = "42.2.10"
val PureConfigVersion = "0.12.2"
val Slf4jVersion = "1.7.30"
val SqliteVersion = "3.30.1"
val StanfordNlpVersion = "3.9.2"
val TikaVersion = "1.23"
@ -38,6 +39,8 @@ object Dependencies {
val poi = Seq(
"org.apache.poi" % "poi" % PoiVersion,
"org.apache.poi" % "poi-ooxml" % PoiVersion,
"org.slf4j" % "slf4j-log4j12" % Slf4jVersion,
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
).map(_.excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("log4j")
@ -54,11 +57,17 @@ object Dependencies {
ExclusionRule("hamcrest-core")
))
val twelvemonkeys = Seq(
"com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
"com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
)
val pdfbox = Seq(
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("org.bouncycastle")
)
),
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
)
val emil = Seq(