Reorganize processing code

Use separate modules for: text extraction, conversion to PDF, and text analysis.
2025-09-15 21:46:53 +00:00 · 2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions
--- a/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala
@@ -1,9 +1,10 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import fs2.Stream
 import cats.implicits._
-import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
-import docspell.text.split.TextSplitter
+
+import docspell.common._
+import docspell.analysis.split._

 object Contact {
  private[this] val protocols = Set("ftp", "http", "https")
--- a/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala
@@ -1,4 +1,4 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import cats.data.NonEmptyList
 import docspell.common.LenientUri
--- a/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala
@@ -1,6 +1,6 @@
-package docspell.text.contact
+package docspell.analysis.contact

-private[text] object Tld {
+private[analysis] object Tld {

  def findTld(str: String): Option[String] =
    known.find(str.endsWith)
--- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala
@@ -1,10 +1,10 @@
-package docspell.text.date
+package docspell.analysis.date

-import fs2._
 import java.time.LocalDate

-import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
-import docspell.text.split.{TextSplitter, Word}
+import fs2.{Pure, Stream}
+import docspell.common._
+import docspell.analysis.split._

 import scala.util.Try

@@ -21,7 +21,7 @@ object DateFind {
          .map(sd =>
            NerDateLabel(
              sd.toLocalDate,
-              NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
+              NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
            )
          )
      )
--- a/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala
@@ -1,17 +1,17 @@
-package docspell.text.nlp
+package docspell.analysis.nlp

+import java.net.URL
 import java.util.zip.GZIPInputStream

-import docspell.common.{Language, NerLabel, NerTag}
 import edu.stanford.nlp.ie.AbstractSequenceClassifier
 import edu.stanford.nlp.ie.crf.CRFClassifier
 import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
+import org.log4s.getLogger

-import scala.jdk.CollectionConverters._
-import org.log4s._
+import docspell.common._

-import java.net.URL
 import scala.util.Using
+import scala.jdk.CollectionConverters._

 object StanfordNerClassifier {
  private[this] val logger = getLogger
--- a/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala
@@ -1,4 +1,4 @@
-package docspell.text.split
+package docspell.analysis.split

 import fs2.Stream

--- a/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala
+++ b/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala
@@ -1,4 +1,4 @@
-package docspell.text.split
+package docspell.analysis.split

 case class Word(value: String, begin: Int, end: Int) {
  def isEmpty: Boolean  = value.isEmpty
--- a/modules/analysis/src/test/scala/docspell/analysis/TestFiles.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/TestFiles.scala
@@ -0,0 +1,21 @@
+package docspell.analysis
+
+import cats.effect.{Blocker, IO}
+import docspell.files._
+
+import scala.concurrent.ExecutionContext
+
+object TestFiles { // Shared test fixtures: example letter texts loaded via ExampleFiles
+  val blocker     = Blocker.liftExecutionContext(ExecutionContext.global) // Blocker backed by the global ExecutionContext
+  implicit val CS = IO.contextShift(ExecutionContext.global) // implicit context shift for IO, also on the global ExecutionContext
+
+  lazy val letterDEText = // lazy: the file is only read on first access
+    ExampleFiles.letter_de_txt
+      .readText[IO](16 * 1024, blocker) // NOTE(review): 16 * 1024 is presumably the read chunk size in bytes — confirm
+      .unsafeRunSync // runs the IO synchronously and yields its result
+
+  lazy val letterENText = // lazy: the file is only read on first access
+    ExampleFiles.letter_en_txt
+      .readText[IO](16 * 1024, blocker) // NOTE(review): 16 * 1024 is presumably the read chunk size in bytes — confirm
+      .unsafeRunSync // runs the IO synchronously and yields its result
+}
--- a/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala
@@ -1,4 +1,4 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import docspell.common.{NerLabel, NerTag}
 import minitest.SimpleTestSuite
--- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala
@@ -1,8 +1,8 @@
-package docspell.text.date
+package docspell.analysis.date

+import docspell.analysis.TestFiles
+import minitest.SimpleTestSuite
 import docspell.common.Language
-import docspell.text.TestFiles
-import minitest._

 object DateFindSpec extends SimpleTestSuite {

--- a/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala
@@ -1,8 +1,8 @@
-package docspell.text.nlp
+package docspell.analysis.nlp

-import docspell.common.{Language, NerLabel, NerTag}
-import docspell.text.TestFiles
 import minitest.SimpleTestSuite
+import docspell.analysis.TestFiles
+import docspell.common._

 object TextAnalyserSuite extends SimpleTestSuite {

--- a/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala
+++ b/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala
@@ -1,6 +1,6 @@
-package docspell.text.split
+package docspell.analysis.split

-import minitest._
+import minitest.SimpleTestSuite

 object TestSplitterSpec extends SimpleTestSuite {

--- a/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import java.nio.file.{Path, Paths}

--- a/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import java.nio.file.Path

@@ -61,7 +61,7 @@ object Ocr {
  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned.
    */
-  private[text] def runGhostscript[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      cfg: Config,
      wd: Path,
@@ -92,7 +92,7 @@ object Ocr {
  /** Run ghostscript to extract all pdf pages into tiff files. The
    * files are stored to a temporary location on disk and returned.
    */
-  private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
      pdf: Path,
      ghostscript: SystemCommand.Config,
      wd: Path,
@@ -120,7 +120,7 @@ object Ocr {
  /** Run unpaper to optimize the image for ocr. The
    * files are stored to a temporary location on disk and returned.
    */
-  private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
+  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
      img: Path,
      unpaper: SystemCommand.Config,
      wd: Path,
@@ -146,7 +146,7 @@ object Ocr {
  /** Run tesseract on the given image file and return the extracted
    * text.
    */
-  private[text] def runTesseractFile[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
@@ -164,7 +164,7 @@ object Ocr {
  /** Run tesseract on the given image file and return the extracted
    * text.
    */
-  private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
--- a/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
+++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import cats.effect.{Blocker, ContextShift, Sync}
 import docspell.common._
--- a/modules/extract/src/test/resources/logback.xml
+++ b/modules/extract/src/test/resources/logback.xml
--- a/modules/extract/src/test/scala/docspell/extract/TestFiles.scala
+++ b/modules/extract/src/test/scala/docspell/extract/TestFiles.scala
@@ -1,4 +1,4 @@
-package docspell.text
+package docspell.extract

 import fs2.Stream
 import cats.effect.{Blocker, IO}
--- a/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
+++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala
@@ -1,9 +1,9 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import cats.effect.IO
 import docspell.common._
 import docspell.files._
-import docspell.text.TestFiles
+import docspell.extract.TestFiles
 import minitest.SimpleTestSuite

 object TextExtractionSuite extends SimpleTestSuite {
--- a/modules/joex/src/main/scala/docspell/joex/Config.scala
+++ b/modules/joex/src/main/scala/docspell/joex/Config.scala
@@ -3,7 +3,7 @@ package docspell.joex
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.SchedulerConfig
 import docspell.store.JdbcConfig
-import docspell.text.ocr.{Config => OcrConfig}
+import docspell.extract.ocr.{Config => OcrConfig}
 import docspell.convert.ConvertConfig

 case class Config(
--- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala
@@ -5,11 +5,11 @@ import java.time.ZoneId
 import cats.{Applicative, FlatMap}
 import cats.implicits._
 import cats.effect.Sync
+import docspell.analysis.contact._
 import docspell.common.MetaProposal.Candidate
 import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
-import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
-import docspell.text.contact.Domain
+import docspell.store.records._

 /** Super simple approach to find corresponding meta data to an item
  * by looking up values from NER in the users address book.
--- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala
@@ -2,13 +2,13 @@ package docspell.joex.process

 import cats.implicits._
 import cats.effect.Sync
-import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
+import docspell.analysis.nlp._
+import docspell.analysis.contact._
+import docspell.analysis.date._
+import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
-import docspell.text.contact.Contact
-import docspell.text.date.DateFind
-import docspell.text.nlp.StanfordNerClassifier

 object TextAnalysis {

--- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
+++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala
@@ -7,7 +7,7 @@ import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.Store
 import docspell.store.records.{RAttachment, RAttachmentMeta}
-import docspell.text.ocr.{TextExtract, Config => OcrConfig}
+import docspell.extract.ocr.{TextExtract, Config => OcrConfig}

 object TextExtraction {