Reorganize processing code

Use separate modules for:

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner
2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -1,9 +1,10 @@
package docspell.text.contact
package docspell.analysis.contact
import fs2.Stream
import cats.implicits._
import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
import docspell.text.split.TextSplitter
import docspell.common._
import docspell.analysis.split._
object Contact {
private[this] val protocols = Set("ftp", "http", "https")

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri

View File

@ -1,6 +1,6 @@
package docspell.text.contact
package docspell.analysis.contact
private[text] object Tld {
private[analysis] object Tld {
def findTld(str: String): Option[String] =
known.find(str.endsWith)

View File

@ -1,10 +1,10 @@
package docspell.text.date
package docspell.analysis.date
import fs2._
import java.time.LocalDate
import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
import docspell.text.split.{TextSplitter, Word}
import fs2.{Pure, Stream}
import docspell.common._
import docspell.analysis.split._
import scala.util.Try
@ -21,7 +21,7 @@ object DateFind {
.map(sd =>
NerDateLabel(
sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
)
)
)

View File

@ -1,17 +1,17 @@
package docspell.text.nlp
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import docspell.common.{Language, NerLabel, NerTag}
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import scala.jdk.CollectionConverters._
import org.log4s._
import docspell.common._
import java.net.URL
import scala.util.Using
import scala.jdk.CollectionConverters._
object StanfordNerClassifier {
private[this] val logger = getLogger

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
import fs2.Stream

View File

@ -1,4 +1,4 @@
package docspell.text.split
package docspell.analysis.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty

View File

@ -0,0 +1,21 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
/** Shared fixtures for tests: a `Blocker` and a `ContextShift[IO]` that are
  * both backed by the global execution context, plus the text of two example
  * letters obtained from `ExampleFiles`.
  */
object TestFiles {
  // Public members, and the implicit in particular, carry explicit types so
  // that implicit resolution does not depend on type inference.
  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS: cats.effect.ContextShift[IO] = IO.contextShift(ExecutionContext.global)

  /** Result of reading `ExampleFiles.letter_de_txt`, evaluated on first access.
    * NOTE(review): `16 * 1024` is presumably the read chunk size in bytes;
    * confirm against `readText`.
    */
  lazy val letterDEText =
    ExampleFiles.letter_de_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync() // side-effecting call: keep the parentheses

  /** Result of reading `ExampleFiles.letter_en_txt`, evaluated on first access.
    * NOTE(review): `16 * 1024` is presumably the read chunk size in bytes;
    * confirm against `readText`.
    */
  lazy val letterENText =
    ExampleFiles.letter_en_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync() // side-effecting call: keep the parentheses
}

View File

@ -1,4 +1,4 @@
package docspell.text.contact
package docspell.analysis.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite

View File

@ -1,8 +1,8 @@
package docspell.text.date
package docspell.analysis.date
import docspell.analysis.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language
import docspell.text.TestFiles
import minitest._
object DateFindSpec extends SimpleTestSuite {

View File

@ -1,8 +1,8 @@
package docspell.text.nlp
package docspell.analysis.nlp
import docspell.common.{Language, NerLabel, NerTag}
import docspell.text.TestFiles
import minitest.SimpleTestSuite
import docspell.analysis.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {

View File

@ -1,6 +1,6 @@
package docspell.text.split
package docspell.analysis.split
import minitest._
import minitest.SimpleTestSuite
object TestSplitterSpec extends SimpleTestSuite {

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.{Path, Paths}

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import java.nio.file.Path
@ -61,7 +61,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte],
cfg: Config,
wd: Path,
@ -92,7 +92,7 @@ object Ocr {
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path,
ghostscript: SystemCommand.Config,
wd: Path,
@ -120,7 +120,7 @@ object Ocr {
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
img: Path,
unpaper: SystemCommand.Config,
wd: Path,
@ -146,7 +146,7 @@ object Ocr {
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path,
blocker: Blocker,
lang: String,
@ -164,7 +164,7 @@ object Ocr {
/** Run tesseract on the given image byte stream and return the
* extracted text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte],
blocker: Blocker,
lang: String,

View File

@ -1,4 +1,4 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common._

View File

@ -1,4 +1,4 @@
package docspell.text
package docspell.extract
import fs2.Stream
import cats.effect.{Blocker, IO}

View File

@ -1,9 +1,9 @@
package docspell.text.ocr
package docspell.extract.ocr
import cats.effect.IO
import docspell.common._
import docspell.files._
import docspell.text.TestFiles
import docspell.extract.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {

View File

@ -3,7 +3,7 @@ package docspell.joex
import docspell.common.{Ident, LenientUri}
import docspell.joex.scheduler.SchedulerConfig
import docspell.store.JdbcConfig
import docspell.text.ocr.{Config => OcrConfig}
import docspell.extract.ocr.{Config => OcrConfig}
import docspell.convert.ConvertConfig
case class Config(

View File

@ -5,11 +5,11 @@ import java.time.ZoneId
import cats.{Applicative, FlatMap}
import cats.implicits._
import cats.effect.Sync
import docspell.analysis.contact._
import docspell.common.MetaProposal.Candidate
import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
import docspell.text.contact.Domain
import docspell.store.records._
/** Super simple approach to find corresponding meta data to an item
* by looking up values from NER in the users address book.

View File

@ -2,13 +2,13 @@ package docspell.joex.process
import cats.implicits._
import cats.effect.Sync
import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
import docspell.analysis.nlp._
import docspell.analysis.contact._
import docspell.analysis.date._
import docspell.common._
import docspell.joex.process.ItemData.AttachmentDates
import docspell.joex.scheduler.Task
import docspell.store.records.RAttachmentMeta
import docspell.text.contact.Contact
import docspell.text.date.DateFind
import docspell.text.nlp.StanfordNerClassifier
object TextAnalysis {

View File

@ -7,7 +7,7 @@ import docspell.common._
import docspell.joex.scheduler.{Context, Task}
import docspell.store.Store
import docspell.store.records.{RAttachment, RAttachmentMeta}
import docspell.text.ocr.{TextExtract, Config => OcrConfig}
import docspell.extract.ocr.{TextExtract, Config => OcrConfig}
object TextExtraction {