Reorganize processing code

Use separate modules for

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner 2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -145,14 +145,14 @@ val common = project.in(file("modules/common")).
// Some example files for testing
// https://file-examples.com/index.php/sample-documents-download/sample-doc-download/
val exampleFiles = project.in(file("modules/files")).
val files = project.in(file("modules/files")).
disablePlugins(RevolverPlugin).
settings(sharedSettings).
settings(testSettings).
settings(
name := "docspell-files",
libraryDependencies ++=
Dependencies.tika,
Dependencies.tika ,
Test / sourceGenerators += Def.task {
val base = (Test/resourceDirectory).value
val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base)
@ -196,18 +196,17 @@ val store = project.in(file("modules/store")).
Dependencies.emil
).dependsOn(common)
val text = project.in(file("modules/text")).
val extract = project.in(file("modules/extract")).
disablePlugins(RevolverPlugin).
enablePlugins(NerModelsPlugin).
settings(sharedSettings).
settings(testSettings).
settings(NerModelsPlugin.nerClassifierSettings).
settings(
name := "docspell-text",
name := "docspell-extract",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.stanfordNlpCore
).dependsOn(common, exampleFiles % "compile->compile;test->test")
Dependencies.pdfbox ++
Dependencies.poi
).dependsOn(common, files % "compile->compile;test->test")
val convert = project.in(file("modules/convert")).
disablePlugins(RevolverPlugin).
@ -216,9 +215,21 @@ val convert = project.in(file("modules/convert")).
settings(
name := "docspell-convert",
libraryDependencies ++=
Dependencies.pdfbox ++
Dependencies.flexmark
).dependsOn(common, exampleFiles % "compile->compile;test->test")
).dependsOn(common, files % "compile->compile;test->test")
val analysis = project.in(file("modules/analysis")).
disablePlugins(RevolverPlugin).
enablePlugins(NerModelsPlugin).
settings(sharedSettings).
settings(testSettings).
settings(NerModelsPlugin.nerClassifierSettings).
settings(
name := "docspell-analysis",
libraryDependencies ++=
Dependencies.fs2 ++
Dependencies.stanfordNlpCore
).dependsOn(common, files % "test->test")
val restapi = project.in(file("modules/restapi")).
disablePlugins(RevolverPlugin).
@ -272,7 +283,7 @@ val joex = project.in(file("modules/joex")).
addCompilerPlugin(Dependencies.betterMonadicFor),
buildInfoPackage := "docspell.joex",
reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}")
).dependsOn(store, text, convert, joexapi, restapi)
).dependsOn(store, extract, convert, analysis, joexapi, restapi)
val backend = project.in(file("modules/backend")).
disablePlugins(RevolverPlugin).
@ -402,9 +413,10 @@ val root = project.in(file(".")).
name := "docspell-root"
).
aggregate(common
, text
, extract
, convert
, exampleFiles
, analysis
, files
, store
, joexapi
, joex

View File

@ -1,9 +1,10 @@
package docspell.text.contact
package docspell.analysis.contact
import fs2.Stream
import cats.implicits._
import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
import docspell.text.split.TextSplitter
import docspell.common._
import docspell.analysis.split._
object Contact {
private[this] val protocols = Set("ftp", "http", "https")

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri

View File

@ -1,6 +1,6 @@
package docspell.text.contact
package docspell.analysis.contact
private[text] object Tld {
private[analysis] object Tld {
def findTld(str: String): Option[String] =
known.find(str.endsWith)

View File

@ -1,10 +1,10 @@
package docspell.text.date
package docspell.analysis.date
import fs2._
import java.time.LocalDate
import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
import docspell.text.split.{TextSplitter, Word}
import fs2.{Pure, Stream}
import docspell.common._
import docspell.analysis.split._
import scala.util.Try
@ -21,7 +21,7 @@ object DateFind {
.map(sd =>
NerDateLabel(
sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
)
)
)

View File

@ -1,17 +1,17 @@
package docspell.text.nlp
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import docspell.common.{Language, NerLabel, NerTag}
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import scala.jdk.CollectionConverters._
import org.log4s._
import docspell.common._
import java.net.URL
import scala.util.Using
import scala.jdk.CollectionConverters._
object StanfordNerClassifier {
private[this] val logger = getLogger

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
import fs2.Stream

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty

View File

@ -0,0 +1,21 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
  // Blocker backed by the global execution context; fine for test-only IO.
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  // German sample letter, read once on first access and cached thereafter.
  lazy val letterDEText =
    ExampleFiles.letter_de_txt.readText[IO](16384, blocker).unsafeRunSync

  // English sample letter, read once on first access and cached thereafter.
  lazy val letterENText =
    ExampleFiles.letter_en_txt.readText[IO](16384, blocker).unsafeRunSync
}

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite

View File

@ -1,8 +1,8 @@
package docspell.text.date
package docspell.analysis.date
import docspell.analysis.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language
import docspell.text.TestFiles
import minitest._
object DateFindSpec extends SimpleTestSuite {

View File

@ -1,8 +1,8 @@
package docspell.text.nlp
package docspell.analysis.nlp
import docspell.common.{Language, NerLabel, NerTag}
import docspell.text.TestFiles
import minitest.SimpleTestSuite
import docspell.analysis.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {

View File

@ -1,6 +1,6 @@
package docspell.text.split
package docspell.analysis.split
import minitest._
import minitest.SimpleTestSuite
object TestSplitterSpec extends SimpleTestSuite {

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.{Path, Paths}

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.Path
@ -61,7 +61,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
cfg: Config,
wd: Path,
@ -92,7 +92,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path,
ghostscript: SystemCommand.Config,
wd: Path,
@ -120,7 +120,7 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path,
unpaper: SystemCommand.Config,
wd: Path,
@ -146,7 +146,7 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
@ -164,7 +164,7 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._

View File

@ -1,4 +1,4 @@
package docspell.text
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}

View File

@ -1,9 +1,9 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.text.TestFiles
import docspell.extract.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {

View File

@ -3,7 +3,7 @@ package docspell.joex
import docspell.common.{Ident, LenientUri}
import docspell.joex.scheduler.SchedulerConfig
import docspell.store.JdbcConfig
import docspell.text.ocr.{Config => OcrConfig}
import docspell.extract.ocr.{Config => OcrConfig}
import docspell.convert.ConvertConfig
case class Config(

View File

@ -5,11 +5,11 @@ import java.time.ZoneId
import cats.{Applicative, FlatMap}
import cats.implicits._
import cats.effect.Sync
import docspell.analysis.contact._
import docspell.common.MetaProposal.Candidate
import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
import docspell.text.contact.Domain
import docspell.store.records._
/** Super simple approach to find corresponding meta data to an item
* by looking up values from NER in the users address book.

View File

@ -2,13 +2,13 @@ package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
import docspell.analysis.nlp._
import docspell.analysis.contact._
import docspell.analysis.date._
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
import docspell.text.contact.Contact
import docspell.text.date.DateFind
import docspell.text.nlp.StanfordNerClassifier
object TextAnalysis {

View File

@ -7,7 +7,7 @@ import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.Store
import docspell.store.records.{RAttachment, RAttachmentMeta}
import docspell.text.ocr.{TextExtract, Config => OcrConfig}
import docspell.extract.ocr.{TextExtract, Config => OcrConfig}
object TextExtraction {

View File

@ -25,6 +25,7 @@ object Dependencies {
val PoiVersion = "4.1.1"
val PostgresVersion = "42.2.10"
val PureConfigVersion = "0.12.2"
val Slf4jVersion = "1.7.30"
val SqliteVersion = "3.30.1"
val StanfordNlpVersion = "3.9.2"
val TikaVersion = "1.23"
@ -38,6 +39,8 @@ object Dependencies {
val poi = Seq(
"org.apache.poi" % "poi" % PoiVersion,
"org.apache.poi" % "poi-ooxml" % PoiVersion,
"org.slf4j" % "slf4j-log4j12" % Slf4jVersion,
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
).map(_.excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("log4j")
@ -54,11 +57,17 @@ object Dependencies {
ExclusionRule("hamcrest-core")
))
val twelvemonkeys = Seq(
"com.twelvemonkeys.imageio" % "imageio-jpeg" % "3.5",
"com.twelvemonkeys.imageio" % "imageio-tiff" % "3.5"
)
val pdfbox = Seq(
"org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll(
ExclusionRule("commons-logging"),
ExclusionRule("org.bouncycastle")
)
),
"org.slf4j" % "slf4j-jcl" % Slf4jVersion
)
val emil = Seq(