Mirror of https://github.com/TheAnachronism/docspell.git
Reorganize processing code

Use separate modules for:
- text extraction
- conversion to pdf
- text analysis
parent 919381be1e
commit 851ee7ef0f
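The split loosely follows the processing steps in joex. As a rough, hedged sketch of the intended separation of concerns (the object and function names below are illustrative placeholders, not the actual docspell APIs):

// Illustrative sketch only: shows which new module owns which step.
import cats.effect.IO
import fs2.Stream

object ProcessingSketch {
  // docspell-extract: pull raw text out of the uploaded file (ocr, pdfbox, poi, ...)
  def extractText(data: Stream[IO, Byte]): IO[String] =
    data.through(fs2.text.utf8Decode).compile.string

  // docspell-convert: produce a PDF rendition of the original file (pdfbox, flexmark, ...)
  def convertToPdf(data: Stream[IO, Byte]): IO[Stream[IO, Byte]] =
    IO.pure(data) // placeholder, no real conversion

  // docspell-analysis: run NER, date and contact analysis on the extracted text
  def analyze(text: String): IO[List[String]] =
    IO.pure(text.split("\\s+").toList.take(5)) // placeholder "labels"

  def processItem(data: Stream[IO, Byte]): IO[(Stream[IO, Byte], List[String])] =
    for {
      txt    <- extractText(data)
      pdf    <- convertToPdf(data)
      labels <- analyze(txt)
    } yield (pdf, labels)
}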
build.sbt (38 lines changed)
@@ -145,14 +145,14 @@ val common = project.in(file("modules/common")).

 // Some example files for testing
 // https://file-examples.com/index.php/sample-documents-download/sample-doc-download/
-val exampleFiles = project.in(file("modules/files")).
+val files = project.in(file("modules/files")).
   disablePlugins(RevolverPlugin).
   settings(sharedSettings).
   settings(testSettings).
   settings(
     name := "docspell-files",
     libraryDependencies ++=
-      Dependencies.tika,
+      Dependencies.tika ,
     Test / sourceGenerators += Def.task {
       val base = (Test/resourceDirectory).value
       val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
@@ -196,18 +196,17 @@ val store = project.in(file("modules/store")).
     Dependencies.emil
   ).dependsOn(common)

-val text = project.in(file("modules/text")).
+val extract = project.in(file("modules/extract")).
   disablePlugins(RevolverPlugin).
-  enablePlugins(NerModelsPlugin).
   settings(sharedSettings).
   settings(testSettings).
-  settings(NerModelsPlugin.nerClassifierSettings).
   settings(
-    name := "docspell-text",
+    name := "docspell-extract",
     libraryDependencies ++=
       Dependencies.fs2 ++
-      Dependencies.stanfordNlpCore
-  ).dependsOn(common, exampleFiles % "compile->compile;test->test")
+      Dependencies.pdfbox ++
+      Dependencies.poi
+  ).dependsOn(common, files % "compile->compile;test->test")

 val convert = project.in(file("modules/convert")).
   disablePlugins(RevolverPlugin).
@@ -216,9 +215,21 @@ val convert = project.in(file("modules/convert")).
   settings(
     name := "docspell-convert",
     libraryDependencies ++=
       Dependencies.pdfbox ++
       Dependencies.flexmark
-  ).dependsOn(common, exampleFiles % "compile->compile;test->test")
+  ).dependsOn(common, files % "compile->compile;test->test")
+
+val analysis = project.in(file("modules/analysis")).
+  disablePlugins(RevolverPlugin).
+  enablePlugins(NerModelsPlugin).
+  settings(sharedSettings).
+  settings(testSettings).
+  settings(NerModelsPlugin.nerClassifierSettings).
+  settings(
+    name := "docspell-analysis",
+    libraryDependencies ++=
+      Dependencies.fs2 ++
+      Dependencies.stanfordNlpCore
+  ).dependsOn(common, files % "test->test")

 val restapi = project.in(file("modules/restapi")).
   disablePlugins(RevolverPlugin).
@@ -272,7 +283,7 @@ val joex = project.in(file("modules/joex")).
     addCompilerPlugin(Dependencies.betterMonadicFor),
     buildInfoPackage := "docspell.joex",
     reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
-  ).dependsOn(store, text, convert, joexapi, restapi)
+  ).dependsOn(store, extract, convert, analysis, joexapi, restapi)

 val backend = project.in(file("modules/backend")).
   disablePlugins(RevolverPlugin).
@@ -402,9 +413,10 @@ val root = project.in(file(".")).
     name := "docspell-root"
   ).
   aggregate(common
-    , text
+    , extract
     , convert
-    , exampleFiles
+    , analysis
+    , files
     , store
     , joexapi
     , joex
@@ -1,9 +1,10 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import fs2.Stream
 import cats.implicits._
-import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
-import docspell.text.split.TextSplitter
+import docspell.common._
+import docspell.analysis.split._

 object Contact {
   private[this] val protocols = Set("ftp", "http", "https")
@@ -1,4 +1,4 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import cats.data.NonEmptyList
 import docspell.common.LenientUri
@@ -1,6 +1,6 @@
-package docspell.text.contact
+package docspell.analysis.contact

-private[text] object Tld {
+private[analysis] object Tld {

   def findTld(str: String): Option[String] =
     known.find(str.endsWith)
@@ -1,10 +1,10 @@
-package docspell.text.date
+package docspell.analysis.date

-import fs2._
 import java.time.LocalDate

-import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
-import docspell.text.split.{TextSplitter, Word}
+import fs2.{Pure, Stream}
+import docspell.common._
+import docspell.analysis.split._

 import scala.util.Try
@@ -21,7 +21,7 @@ object DateFind {
       .map(sd =>
         NerDateLabel(
           sd.toLocalDate,
-          NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
+          NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
         )
       )
     )
@@ -1,17 +1,17 @@
-package docspell.text.nlp
+package docspell.analysis.nlp

-import java.net.URL
 import java.util.zip.GZIPInputStream

-import docspell.common.{Language, NerLabel, NerTag}
 import edu.stanford.nlp.ie.AbstractSequenceClassifier
 import edu.stanford.nlp.ie.crf.CRFClassifier
 import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
-import org.log4s.getLogger
-
-import scala.jdk.CollectionConverters._
+import org.log4s._
+import docspell.common._
+
+import java.net.URL
+import scala.util.Using
+import scala.jdk.CollectionConverters._

 object StanfordNerClassifier {
   private[this] val logger = getLogger
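The reshuffled imports bring in scala.util.Using next to GZIPInputStream, which points at resource-safe loading of the gzipped NER model files. A minimal, hedged sketch of that pattern (not the docspell implementation; the helper name is made up, and readAllBytes assumes JDK 9+):

import java.net.URL
import java.util.zip.GZIPInputStream
import scala.util.{Try, Using}

object GzipModelSketch {
  // Read a gzip-compressed resource (e.g. a packaged classifier model) into memory,
  // closing the stream even when decompression fails.
  def readGzipped(url: URL): Try[Array[Byte]] =
    Using(new GZIPInputStream(url.openStream())) { in =>
      in.readAllBytes()
    }
}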
@@ -1,4 +1,4 @@
-package docspell.text.split
+package docspell.analysis.split

 import fs2.Stream
@@ -1,4 +1,4 @@
-package docspell.text.split
+package docspell.analysis.split

 case class Word(value: String, begin: Int, end: Int) {
   def isEmpty: Boolean = value.isEmpty
@@ -0,0 +1,21 @@
+package docspell.analysis
+
+import cats.effect.{Blocker, IO}
+import docspell.files._
+
+import scala.concurrent.ExecutionContext
+
+object TestFiles {
+  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
+  implicit val CS = IO.contextShift(ExecutionContext.global)
+
+  lazy val letterDEText =
+    ExampleFiles.letter_de_txt
+      .readText[IO](16 * 1024, blocker)
+      .unsafeRunSync
+
+  lazy val letterENText =
+    ExampleFiles.letter_en_txt
+      .readText[IO](16 * 1024, blocker)
+      .unsafeRunSync
+}
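For context, a hedged sketch of how a suite in the analysis module can consume this shared fixture (the suite name and assertions are illustrative, not taken from the repository):

package docspell.analysis

import minitest.SimpleTestSuite

// Illustrative only: checks that the shared test fixtures load; real specs such as
// DateFindSpec feed these texts into the analysis code.
object TestFilesSketch extends SimpleTestSuite {
  test("sample letters are available") {
    assert(TestFiles.letterDEText.nonEmpty)
    assert(TestFiles.letterENText.nonEmpty)
  }
}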
@@ -1,4 +1,4 @@
-package docspell.text.contact
+package docspell.analysis.contact

 import docspell.common.{NerLabel, NerTag}
 import minitest.SimpleTestSuite
@@ -1,8 +1,8 @@
-package docspell.text.date
+package docspell.analysis.date

+import docspell.analysis.TestFiles
+import minitest.SimpleTestSuite
 import docspell.common.Language
-import docspell.text.TestFiles
-import minitest._

 object DateFindSpec extends SimpleTestSuite {
@@ -1,8 +1,8 @@
-package docspell.text.nlp
+package docspell.analysis.nlp

-import docspell.common.{Language, NerLabel, NerTag}
-import docspell.text.TestFiles
 import minitest.SimpleTestSuite
+import docspell.analysis.TestFiles
+import docspell.common._

 object TextAnalyserSuite extends SimpleTestSuite {
@@ -1,6 +1,6 @@
-package docspell.text.split
+package docspell.analysis.split

-import minitest._
+import minitest.SimpleTestSuite

 object TestSplitterSpec extends SimpleTestSuite {
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import java.nio.file.{Path, Paths}
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import java.nio.file.Path
@@ -61,7 +61,7 @@ object Ocr {
   /** Run ghostscript to extract all pdf pages into tiff files. The
     * files are stored to a temporary location on disk and returned.
     */
-  private[text] def runGhostscript[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscript[F[_]: Sync: ContextShift](
      pdf: Stream[F, Byte],
      cfg: Config,
      wd: Path,
@@ -92,7 +92,7 @@ object Ocr {
   /** Run ghostscript to extract all pdf pages into tiff files. The
     * files are stored to a temporary location on disk and returned.
     */
-  private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
+  private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
      pdf: Path,
      ghostscript: SystemCommand.Config,
      wd: Path,
@@ -120,7 +120,7 @@ object Ocr {
   /** Run unpaper to optimize the image for ocr. The
     * files are stored to a temporary location on disk and returned.
     */
-  private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
+  private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
      img: Path,
      unpaper: SystemCommand.Config,
      wd: Path,
@@ -146,7 +146,7 @@ object Ocr {
   /** Run tesseract on the given image file and return the extracted
     * text.
     */
-  private[text] def runTesseractFile[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
      img: Path,
      blocker: Blocker,
      lang: String,
@@ -164,7 +164,7 @@ object Ocr {
   /** Run tesseract on the given image file and return the extracted
     * text.
     */
-  private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
+  private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
      img: Stream[F, Byte],
      blocker: Blocker,
      lang: String,
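All of these helpers wrap external programs (ghostscript, unpaper, tesseract) behind Sync and ContextShift. A stripped-down, hedged sketch of that pattern with cats-effect 2 (this is not docspell's SystemCommand; the helper below is illustrative):

import java.nio.file.Path
import cats.effect.{Blocker, ContextShift, Sync}

object ExternalCommandSketch {
  // Run an external tool (e.g. tesseract) in a working directory on the blocking
  // pool and return its exit code. Real code would also capture stdout/stderr.
  def runTool[F[_]: Sync: ContextShift](blocker: Blocker, program: String, args: List[String], wd: Path): F[Int] =
    blocker.delay[F, Int] {
      new ProcessBuilder((program :: args): _*)
        .directory(wd.toFile)
        .inheritIO()
        .start()
        .waitFor()
    }
}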
@@ -1,4 +1,4 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import cats.effect.{Blocker, ContextShift, Sync}
 import docspell.common._
@@ -1,4 +1,4 @@
-package docspell.text
+package docspell.extract

 import fs2.Stream
 import cats.effect.{Blocker, IO}
@@ -1,9 +1,9 @@
-package docspell.text.ocr
+package docspell.extract.ocr

 import cats.effect.IO
 import docspell.common._
 import docspell.files._
-import docspell.text.TestFiles
+import docspell.extract.TestFiles
 import minitest.SimpleTestSuite

 object TextExtractionSuite extends SimpleTestSuite {
@@ -3,7 +3,7 @@ package docspell.joex
 import docspell.common.{Ident, LenientUri}
 import docspell.joex.scheduler.SchedulerConfig
 import docspell.store.JdbcConfig
-import docspell.text.ocr.{Config => OcrConfig}
+import docspell.extract.ocr.{Config => OcrConfig}
 import docspell.convert.ConvertConfig

 case class Config(
@@ -5,11 +5,11 @@ import java.time.ZoneId
 import cats.{Applicative, FlatMap}
 import cats.implicits._
 import cats.effect.Sync
+import docspell.analysis.contact._
 import docspell.common.MetaProposal.Candidate
 import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
-import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
-import docspell.text.contact.Domain
+import docspell.store.records._

 /** Super simple approach to find corresponding meta data to an item
   * by looking up values from NER in the users address book.
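The scaladoc above describes the matching step: NER labels found in the document text are looked up in the user's address book and turned into metadata proposals. A hedged, simplified sketch of that idea (types and lookup are illustrative, not the docspell implementation):

// Illustrative only: naive label-to-addressbook matching.
final case class LabelSketch(tag: String, value: String)
final case class ProposalSketch(tag: String, candidates: List[String])

object FindProposalSketch {
  def propose(labels: List[LabelSketch], addressBook: Map[String, List[String]]): List[ProposalSketch] =
    labels.map { label =>
      val candidates = addressBook
        .getOrElse(label.tag, Nil)
        .filter(_.toLowerCase.contains(label.value.toLowerCase))
      ProposalSketch(label.tag, candidates)
    }
}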
@@ -2,13 +2,13 @@ package docspell.joex.process

 import cats.implicits._
 import cats.effect.Sync
-import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
+import docspell.analysis.nlp._
+import docspell.analysis.contact._
+import docspell.analysis.date._
+import docspell.common._
 import docspell.joex.process.ItemData.AttachmentDates
 import docspell.joex.scheduler.Task
 import docspell.store.records.RAttachmentMeta
-import docspell.text.contact.Contact
-import docspell.text.date.DateFind
-import docspell.text.nlp.StanfordNerClassifier

 object TextAnalysis {
@@ -7,7 +7,7 @@ import docspell.common._
 import docspell.joex.scheduler.{Context, Task}
 import docspell.store.Store
 import docspell.store.records.{RAttachment, RAttachmentMeta}
-import docspell.text.ocr.{TextExtract, Config => OcrConfig}
+import docspell.extract.ocr.{TextExtract, Config => OcrConfig}

 object TextExtraction {
@@ -25,6 +25,7 @@ object Dependencies {
   val PoiVersion = "4.1.1"
   val PostgresVersion = "42.2.10"
   val PureConfigVersion = "0.12.2"
+  val Slf4jVersion = "1.7.30"
   val SqliteVersion = "3.30.1"
   val StanfordNlpVersion = "3.9.2"
   val TikaVersion = "1.23"
@@ -38,6 +39,8 @@ object Dependencies {
   val poi = Seq(
     "org.apache.poi" % "poi" % PoiVersion,
     "org.apache.poi" % "poi-ooxml" % PoiVersion,
+    "org.slf4j" % "slf4j-log4j12" % Slf4jVersion,
+    "org.slf4j" % "slf4j-jcl" % Slf4jVersion
   ).map(_.excludeAll(
     ExclusionRule("commons-logging"),
     ExclusionRule("log4j")
@@ -54,11 +57,17 @@ object Dependencies {
     ExclusionRule("hamcrest-core")
   ))

+  val twelvemonkeys = Seq(
+    "com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
+    "com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
+  )
+
   val pdfbox = Seq(
     "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll(
       ExclusionRule("commons-logging"),
       ExclusionRule("org.bouncycastle")
-    )
+    ),
+    "org.slf4j" % "slf4j-jcl" % Slf4jVersion
   )

   val emil = Seq(