diff --git a/build.sbt b/build.sbt index 1313de8d..afe1e09c 100644 --- a/build.sbt +++ b/build.sbt @@ -145,14 +145,14 @@ val common = project.in(file("modules/common")). // Some example files for testing // https://file-examples.com/index.php/sample-documents-download/sample-doc-download/ -val exampleFiles = project.in(file("modules/files")). +val files = project.in(file("modules/files")). disablePlugins(RevolverPlugin). settings(sharedSettings). settings(testSettings). settings( name := "docspell-files", libraryDependencies ++= - Dependencies.tika, + Dependencies.tika , Test / sourceGenerators += Def.task { val base = (Test/resourceDirectory).value val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base) @@ -196,18 +196,17 @@ val store = project.in(file("modules/store")). Dependencies.emil ).dependsOn(common) -val text = project.in(file("modules/text")). +val extract = project.in(file("modules/extract")). disablePlugins(RevolverPlugin). - enablePlugins(NerModelsPlugin). settings(sharedSettings). settings(testSettings). - settings(NerModelsPlugin.nerClassifierSettings). settings( - name := "docspell-text", + name := "docspell-extract", libraryDependencies ++= Dependencies.fs2 ++ - Dependencies.stanfordNlpCore - ).dependsOn(common, exampleFiles % "compile->compile;test->test") + Dependencies.pdfbox ++ + Dependencies.poi + ).dependsOn(common, files % "compile->compile;test->test") val convert = project.in(file("modules/convert")). disablePlugins(RevolverPlugin). @@ -216,9 +215,21 @@ val convert = project.in(file("modules/convert")). settings( name := "docspell-convert", libraryDependencies ++= - Dependencies.pdfbox ++ Dependencies.flexmark - ).dependsOn(common, exampleFiles % "compile->compile;test->test") + ).dependsOn(common, files % "compile->compile;test->test") + +val analysis = project.in(file("modules/analysis")). + disablePlugins(RevolverPlugin). + enablePlugins(NerModelsPlugin). + settings(sharedSettings). + settings(testSettings). 
+ settings(NerModelsPlugin.nerClassifierSettings). + settings( + name := "docspell-analysis", + libraryDependencies ++= + Dependencies.fs2 ++ + Dependencies.stanfordNlpCore + ).dependsOn(common, files % "test->test") val restapi = project.in(file("modules/restapi")). disablePlugins(RevolverPlugin). @@ -272,7 +283,7 @@ val joex = project.in(file("modules/joex")). addCompilerPlugin(Dependencies.betterMonadicFor), buildInfoPackage := "docspell.joex", reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}") - ).dependsOn(store, text, convert, joexapi, restapi) + ).dependsOn(store, extract, convert, analysis, joexapi, restapi) val backend = project.in(file("modules/backend")). disablePlugins(RevolverPlugin). @@ -402,9 +413,10 @@ val root = project.in(file(".")). name := "docspell-root" ). aggregate(common - , text + , extract , convert - , exampleFiles + , analysis + , files , store , joexapi , joex diff --git a/modules/text/src/main/scala/docspell/text/contact/Contact.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala similarity index 92% rename from modules/text/src/main/scala/docspell/text/contact/Contact.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala index f1e5b480..bd3c5823 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Contact.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala @@ -1,9 +1,10 @@ -package docspell.text.contact +package docspell.analysis.contact import fs2.Stream import cats.implicits._ -import docspell.common.{Ident, LenientUri, NerLabel, NerTag} -import docspell.text.split.TextSplitter + +import docspell.common._ +import docspell.analysis.split._ object Contact { private[this] val protocols = Set("ftp", "http", "https") diff --git a/modules/text/src/main/scala/docspell/text/contact/Domain.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala similarity index 
97% rename from modules/text/src/main/scala/docspell/text/contact/Domain.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala index b9973392..3fc16ebb 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Domain.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala @@ -1,4 +1,4 @@ -package docspell.text.contact +package docspell.analysis.contact import cats.data.NonEmptyList import docspell.common.LenientUri diff --git a/modules/text/src/main/scala/docspell/text/contact/Tld.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala similarity index 93% rename from modules/text/src/main/scala/docspell/text/contact/Tld.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala index af7cae07..3f8ba6ae 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Tld.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala @@ -1,6 +1,6 @@ -package docspell.text.contact +package docspell.analysis.contact -private[text] object Tld { +private[analysis] object Tld { def findTld(str: String): Option[String] = known.find(str.endsWith) diff --git a/modules/text/src/main/scala/docspell/text/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala similarity index 93% rename from modules/text/src/main/scala/docspell/text/date/DateFind.scala rename to modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 79f956ec..eb21f0a8 100644 --- a/modules/text/src/main/scala/docspell/text/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -1,10 +1,10 @@ -package docspell.text.date +package docspell.analysis.date -import fs2._ import java.time.LocalDate -import docspell.common.{Language, NerDateLabel, NerLabel, NerTag} -import docspell.text.split.{TextSplitter, Word} +import fs2.{Pure, Stream} +import docspell.common._ +import docspell.analysis.split._ 
import scala.util.Try @@ -21,7 +21,7 @@ object DateFind { .map(sd => NerDateLabel( sd.toLocalDate, - NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end) + NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(2).end) ) ) ) diff --git a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala similarity index 95% rename from modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 084d9dc4..e8a3329d 100644 --- a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,17 +1,17 @@ -package docspell.text.nlp +package docspell.analysis.nlp +import java.net.URL import java.util.zip.GZIPInputStream -import docspell.common.{Language, NerLabel, NerTag} import edu.stanford.nlp.ie.AbstractSequenceClassifier import edu.stanford.nlp.ie.crf.CRFClassifier import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger -import scala.jdk.CollectionConverters._ -import org.log4s._ +import docspell.common._ -import java.net.URL import scala.util.Using +import scala.jdk.CollectionConverters._ object StanfordNerClassifier { private[this] val logger = getLogger diff --git a/modules/text/src/main/scala/docspell/text/split/TextSplitter.scala b/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala similarity index 96% rename from modules/text/src/main/scala/docspell/text/split/TextSplitter.scala rename to modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala index 7a7cd292..d4892dac 100644 --- a/modules/text/src/main/scala/docspell/text/split/TextSplitter.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala @@ -1,4
+1,4 @@ -package docspell.text.split +package docspell.analysis.split import fs2.Stream diff --git a/modules/text/src/main/scala/docspell/text/split/Word.scala b/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala similarity index 96% rename from modules/text/src/main/scala/docspell/text/split/Word.scala rename to modules/analysis/src/main/scala/docspell/analysis/split/Word.scala index 88f0c3c0..f68f0af7 100644 --- a/modules/text/src/main/scala/docspell/text/split/Word.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala @@ -1,4 +1,4 @@ -package docspell.text.split +package docspell.analysis.split case class Word(value: String, begin: Int, end: Int) { def isEmpty: Boolean = value.isEmpty diff --git a/modules/analysis/src/test/scala/docspell/analysis/TestFiles.scala b/modules/analysis/src/test/scala/docspell/analysis/TestFiles.scala new file mode 100644 index 00000000..c01d6ad1 --- /dev/null +++ b/modules/analysis/src/test/scala/docspell/analysis/TestFiles.scala @@ -0,0 +1,21 @@ +package docspell.analysis + +import cats.effect.{Blocker, ContextShift, IO} +import docspell.files._ + +import scala.concurrent.ExecutionContext + +object TestFiles { + val blocker = Blocker.liftExecutionContext(ExecutionContext.global) + implicit val CS: ContextShift[IO] = IO.contextShift(ExecutionContext.global) + + lazy val letterDEText = + ExampleFiles.letter_de_txt + .readText[IO](16 * 1024, blocker) + .unsafeRunSync() + + lazy val letterENText = + ExampleFiles.letter_en_txt + .readText[IO](16 * 1024, blocker) + .unsafeRunSync() +} diff --git a/modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala similarity index 97% rename from modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala index 721d2c35..aaff9203 100644 ---
a/modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala @@ -1,4 +1,4 @@ -package docspell.text.contact +package docspell.analysis.contact import docspell.common.{NerLabel, NerTag} import minitest.SimpleTestSuite diff --git a/modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala similarity index 74% rename from modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index a974fa22..30f0b5bc 100644 --- a/modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -1,8 +1,8 @@ -package docspell.text.date +package docspell.analysis.date +import docspell.analysis.TestFiles +import minitest.SimpleTestSuite import docspell.common.Language -import docspell.text.TestFiles -import minitest._ object DateFindSpec extends SimpleTestSuite { diff --git a/modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala similarity index 95% rename from modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala rename to modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index 309c241b..7c0f150d 100644 --- a/modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -1,8 +1,8 @@ -package docspell.text.nlp +package docspell.analysis.nlp -import docspell.common.{Language, NerLabel, NerTag} -import docspell.text.TestFiles import minitest.SimpleTestSuite +import docspell.analysis.TestFiles +import docspell.common._ object TextAnalyserSuite extends SimpleTestSuite { diff --git 
a/modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala similarity index 91% rename from modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala index 13e91a5d..4c17741d 100644 --- a/modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala @@ -1,6 +1,6 @@ -package docspell.text.split +package docspell.analysis.split -import minitest._ +import minitest.SimpleTestSuite object TestSplitterSpec extends SimpleTestSuite { diff --git a/modules/text/src/main/scala/docspell/text/ocr/Config.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala similarity index 97% rename from modules/text/src/main/scala/docspell/text/ocr/Config.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/Config.scala index 5da49ed0..482c0e91 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Config.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Config.scala @@ -1,4 +1,4 @@ -package docspell.text.ocr +package docspell.extract.ocr import java.nio.file.{Path, Paths} diff --git a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala similarity index 93% rename from modules/text/src/main/scala/docspell/text/ocr/Ocr.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index 7cda9bdf..5cefcbc1 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -1,4 +1,4 @@ -package docspell.text.ocr +package docspell.extract.ocr import java.nio.file.Path @@ -61,7 +61,7 @@ object Ocr { /** Run ghostscript to extract all pdf pages into tiff files. 
The * files are stored to a temporary location on disk and returned. */ - private[text] def runGhostscript[F[_]: Sync: ContextShift]( + private[extract] def runGhostscript[F[_]: Sync: ContextShift]( pdf: Stream[F, Byte], cfg: Config, wd: Path, @@ -92,7 +92,7 @@ object Ocr { /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. */ - private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( + private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift]( pdf: Path, ghostscript: SystemCommand.Config, wd: Path, @@ -120,7 +120,7 @@ object Ocr { /** Run unpaper to optimize the image for ocr. The * files are stored to a temporary location on disk and returned. */ - private[text] def runUnpaperFile[F[_]: Sync: ContextShift]( + private[extract] def runUnpaperFile[F[_]: Sync: ContextShift]( img: Path, unpaper: SystemCommand.Config, wd: Path, @@ -146,7 +146,7 @@ object Ocr { /** Run tesseract on the given image file and return the extracted * text. */ - private[text] def runTesseractFile[F[_]: Sync: ContextShift]( + private[extract] def runTesseractFile[F[_]: Sync: ContextShift]( img: Path, blocker: Blocker, lang: String, @@ -164,7 +164,7 @@ object Ocr { /** Run tesseract on the given image file and return the extracted * text. 
*/ - private[text] def runTesseractStdin[F[_]: Sync: ContextShift]( + private[extract] def runTesseractStdin[F[_]: Sync: ContextShift]( img: Stream[F, Byte], blocker: Blocker, lang: String, diff --git a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala similarity index 97% rename from modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index dc43e524..51a7ca73 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -1,4 +1,4 @@ -package docspell.text.ocr +package docspell.extract.ocr import cats.effect.{Blocker, ContextShift, Sync} import docspell.common._ diff --git a/modules/text/src/test/resources/logback.xml b/modules/extract/src/test/resources/logback.xml similarity index 100% rename from modules/text/src/test/resources/logback.xml rename to modules/extract/src/test/resources/logback.xml diff --git a/modules/text/src/test/scala/docspell/text/TestFiles.scala b/modules/extract/src/test/scala/docspell/extract/TestFiles.scala similarity index 96% rename from modules/text/src/test/scala/docspell/text/TestFiles.scala rename to modules/extract/src/test/scala/docspell/extract/TestFiles.scala index 4db8966b..9c5637e3 100644 --- a/modules/text/src/test/scala/docspell/text/TestFiles.scala +++ b/modules/extract/src/test/scala/docspell/extract/TestFiles.scala @@ -1,4 +1,4 @@ -package docspell.text +package docspell.extract import fs2.Stream import cats.effect.{Blocker, IO} diff --git a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala similarity index 93% rename from modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala rename to 
modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index 01781fcc..0f400a13 100644 --- a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -1,9 +1,9 @@ -package docspell.text.ocr +package docspell.extract.ocr import cats.effect.IO import docspell.common._ import docspell.files._ -import docspell.text.TestFiles +import docspell.extract.TestFiles import minitest.SimpleTestSuite object TextExtractionSuite extends SimpleTestSuite { diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index 4cdd9391..a5a9bc47 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -3,7 +3,7 @@ package docspell.joex import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig -import docspell.text.ocr.{Config => OcrConfig} +import docspell.extract.ocr.{Config => OcrConfig} import docspell.convert.ConvertConfig case class Config( diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index c8b57365..c9aff410 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -5,11 +5,11 @@ import java.time.ZoneId import cats.{Applicative, FlatMap} import cats.implicits._ import cats.effect.Sync +import docspell.analysis.contact._ import docspell.common.MetaProposal.Candidate import docspell.common._ import docspell.joex.scheduler.{Context, Task} -import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson} -import docspell.text.contact.Domain +import docspell.store.records._ /** Super simple approach to find corresponding meta data to an 
item * by looking up values from NER in the users address book. diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 7e4cc13b..ddc3f0c8 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -2,13 +2,13 @@ package docspell.joex.process import cats.implicits._ import cats.effect.Sync -import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs} +import docspell.analysis.nlp._ +import docspell.analysis.contact._ +import docspell.analysis.date._ +import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Task import docspell.store.records.RAttachmentMeta -import docspell.text.contact.Contact -import docspell.text.date.DateFind -import docspell.text.nlp.StanfordNerClassifier object TextAnalysis { diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 478f6a91..fa81774e 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -7,7 +7,7 @@ import docspell.common._ import docspell.joex.scheduler.{Context, Task} import docspell.store.Store import docspell.store.records.{RAttachment, RAttachmentMeta} -import docspell.text.ocr.{TextExtract, Config => OcrConfig} +import docspell.extract.ocr.{TextExtract, Config => OcrConfig} object TextExtraction { diff --git a/project/Dependencies.scala b/project/Dependencies.scala index ed3697d4..c94f29d5 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -25,6 +25,7 @@ object Dependencies { val PoiVersion = "4.1.1" val PostgresVersion = "42.2.10" val PureConfigVersion = "0.12.2" + val Slf4jVersion = "1.7.30" val SqliteVersion = "3.30.1" 
val StanfordNlpVersion = "3.9.2" val TikaVersion = "1.23" @@ -38,6 +39,8 @@ object Dependencies { val poi = Seq( "org.apache.poi" % "poi" % PoiVersion, "org.apache.poi" % "poi-ooxml" % PoiVersion, + "org.slf4j" % "log4j-over-slf4j" % Slf4jVersion, + "org.slf4j" % "jcl-over-slf4j" % Slf4jVersion ).map(_.excludeAll( ExclusionRule("commons-logging"), ExclusionRule("log4j") @@ -54,11 +57,17 @@ object Dependencies { ExclusionRule("hamcrest-core") )) + val twelvemonkeys = Seq( + "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5", + "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5" + ) + val pdfbox = Seq( "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll( ExclusionRule("commons-logging"), ExclusionRule("org.bouncycastle") - ) + ), + "org.slf4j" % "jcl-over-slf4j" % Slf4jVersion ) val emil = Seq(