mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-21 18:08:25 +00:00
Reorganize processing code
Use separate modules for: text extraction, conversion to PDF, and text analysis.
This commit is contained in:
@ -1,9 +1,10 @@
|
||||
package docspell.text.contact
|
||||
package docspell.analysis.contact
|
||||
|
||||
import fs2.Stream
|
||||
import cats.implicits._
|
||||
import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
|
||||
import docspell.text.split.TextSplitter
|
||||
|
||||
import docspell.common._
|
||||
import docspell.analysis.split._
|
||||
|
||||
object Contact {
|
||||
private[this] val protocols = Set("ftp", "http", "https")
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.contact
|
||||
package docspell.analysis.contact
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
import docspell.common.LenientUri
|
@ -1,6 +1,6 @@
|
||||
package docspell.text.contact
|
||||
package docspell.analysis.contact
|
||||
|
||||
private[text] object Tld {
|
||||
private[analysis] object Tld {
|
||||
|
||||
def findTld(str: String): Option[String] =
|
||||
known.find(str.endsWith)
|
@ -1,10 +1,10 @@
|
||||
package docspell.text.date
|
||||
package docspell.analysis.date
|
||||
|
||||
import fs2._
|
||||
import java.time.LocalDate
|
||||
|
||||
import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
|
||||
import docspell.text.split.{TextSplitter, Word}
|
||||
import fs2.{Pure, Stream}
|
||||
import docspell.common._
|
||||
import docspell.analysis.split._
|
||||
|
||||
import scala.util.Try
|
||||
|
||||
@ -21,7 +21,7 @@ object DateFind {
|
||||
.map(sd =>
|
||||
NerDateLabel(
|
||||
sd.toLocalDate,
|
||||
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end)
|
||||
NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end)
|
||||
)
|
||||
)
|
||||
)
|
@ -1,17 +1,17 @@
|
||||
package docspell.text.nlp
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import java.net.URL
|
||||
import java.util.zip.GZIPInputStream
|
||||
|
||||
import docspell.common.{Language, NerLabel, NerTag}
|
||||
import edu.stanford.nlp.ie.AbstractSequenceClassifier
|
||||
import edu.stanford.nlp.ie.crf.CRFClassifier
|
||||
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
|
||||
import org.log4s.getLogger
|
||||
|
||||
import scala.jdk.CollectionConverters._
|
||||
import org.log4s._
|
||||
import docspell.common._
|
||||
|
||||
import java.net.URL
|
||||
import scala.util.Using
|
||||
import scala.jdk.CollectionConverters._
|
||||
|
||||
object StanfordNerClassifier {
|
||||
private[this] val logger = getLogger
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.split
|
||||
package docspell.analysis.split
|
||||
|
||||
import fs2.Stream
|
||||
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.split
|
||||
package docspell.analysis.split
|
||||
|
||||
case class Word(value: String, begin: Int, end: Int) {
|
||||
def isEmpty: Boolean = value.isEmpty
|
@ -0,0 +1,21 @@
|
||||
package docspell.analysis
|
||||
|
||||
import cats.effect.{Blocker, IO}
|
||||
import docspell.files._
|
||||
|
||||
import scala.concurrent.ExecutionContext
|
||||
|
||||
object TestFiles {
|
||||
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
|
||||
implicit val CS = IO.contextShift(ExecutionContext.global)
|
||||
|
||||
lazy val letterDEText =
|
||||
ExampleFiles.letter_de_txt
|
||||
.readText[IO](16 * 1024, blocker)
|
||||
.unsafeRunSync
|
||||
|
||||
lazy val letterENText =
|
||||
ExampleFiles.letter_en_txt
|
||||
.readText[IO](16 * 1024, blocker)
|
||||
.unsafeRunSync
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.contact
|
||||
package docspell.analysis.contact
|
||||
|
||||
import docspell.common.{NerLabel, NerTag}
|
||||
import minitest.SimpleTestSuite
|
@ -1,8 +1,8 @@
|
||||
package docspell.text.date
|
||||
package docspell.analysis.date
|
||||
|
||||
import docspell.analysis.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.common.Language
|
||||
import docspell.text.TestFiles
|
||||
import minitest._
|
||||
|
||||
object DateFindSpec extends SimpleTestSuite {
|
||||
|
@ -1,8 +1,8 @@
|
||||
package docspell.text.nlp
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import docspell.common.{Language, NerLabel, NerTag}
|
||||
import docspell.text.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.analysis.TestFiles
|
||||
import docspell.common._
|
||||
|
||||
object TextAnalyserSuite extends SimpleTestSuite {
|
||||
|
@ -1,6 +1,6 @@
|
||||
package docspell.text.split
|
||||
package docspell.analysis.split
|
||||
|
||||
import minitest._
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object TestSplitterSpec extends SimpleTestSuite {
|
||||
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.ocr
|
||||
package docspell.extract.ocr
|
||||
|
||||
import java.nio.file.{Path, Paths}
|
||||
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.ocr
|
||||
package docspell.extract.ocr
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
@ -61,7 +61,7 @@ object Ocr {
|
||||
/** Run ghostscript to extract all pdf pages into tiff files. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
*/
|
||||
private[text] def runGhostscript[F[_]: Sync: ContextShift](
|
||||
private[extract] def runGhostscript[F[_]: Sync: ContextShift](
|
||||
pdf: Stream[F, Byte],
|
||||
cfg: Config,
|
||||
wd: Path,
|
||||
@ -92,7 +92,7 @@ object Ocr {
|
||||
/** Run ghostscript to extract all pdf pages into tiff files. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
*/
|
||||
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift](
|
||||
pdf: Path,
|
||||
ghostscript: SystemCommand.Config,
|
||||
wd: Path,
|
||||
@ -120,7 +120,7 @@ object Ocr {
|
||||
/** Run unpaper to optimize the image for ocr. The
|
||||
* files are stored to a temporary location on disk and returned.
|
||||
*/
|
||||
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runUnpaperFile[F[_]: Sync: ContextShift](
|
||||
img: Path,
|
||||
unpaper: SystemCommand.Config,
|
||||
wd: Path,
|
||||
@ -146,7 +146,7 @@ object Ocr {
|
||||
/** Run tesseract on the given image file and return the extracted
|
||||
* text.
|
||||
*/
|
||||
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
|
||||
private[extract] def runTesseractFile[F[_]: Sync: ContextShift](
|
||||
img: Path,
|
||||
blocker: Blocker,
|
||||
lang: String,
|
||||
@ -164,7 +164,7 @@ object Ocr {
|
||||
/** Run tesseract on the given image file and return the extracted
|
||||
* text.
|
||||
*/
|
||||
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
|
||||
private[extract] def runTesseractStdin[F[_]: Sync: ContextShift](
|
||||
img: Stream[F, Byte],
|
||||
blocker: Blocker,
|
||||
lang: String,
|
@ -1,4 +1,4 @@
|
||||
package docspell.text.ocr
|
||||
package docspell.extract.ocr
|
||||
|
||||
import cats.effect.{Blocker, ContextShift, Sync}
|
||||
import docspell.common._
|
@ -1,4 +1,4 @@
|
||||
package docspell.text
|
||||
package docspell.extract
|
||||
|
||||
import fs2.Stream
|
||||
import cats.effect.{Blocker, IO}
|
@ -1,9 +1,9 @@
|
||||
package docspell.text.ocr
|
||||
package docspell.extract.ocr
|
||||
|
||||
import cats.effect.IO
|
||||
import docspell.common._
|
||||
import docspell.files._
|
||||
import docspell.text.TestFiles
|
||||
import docspell.extract.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
object TextExtractionSuite extends SimpleTestSuite {
|
@ -3,7 +3,7 @@ package docspell.joex
|
||||
import docspell.common.{Ident, LenientUri}
|
||||
import docspell.joex.scheduler.SchedulerConfig
|
||||
import docspell.store.JdbcConfig
|
||||
import docspell.text.ocr.{Config => OcrConfig}
|
||||
import docspell.extract.ocr.{Config => OcrConfig}
|
||||
import docspell.convert.ConvertConfig
|
||||
|
||||
case class Config(
|
||||
|
@ -5,11 +5,11 @@ import java.time.ZoneId
|
||||
import cats.{Applicative, FlatMap}
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.analysis.contact._
|
||||
import docspell.common.MetaProposal.Candidate
|
||||
import docspell.common._
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson}
|
||||
import docspell.text.contact.Domain
|
||||
import docspell.store.records._
|
||||
|
||||
/** Super simple approach to find corresponding meta data to an item
|
||||
* by looking up values from NER in the users address book.
|
||||
|
@ -2,13 +2,13 @@ package docspell.joex.process
|
||||
|
||||
import cats.implicits._
|
||||
import cats.effect.Sync
|
||||
import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs}
|
||||
import docspell.analysis.nlp._
|
||||
import docspell.analysis.contact._
|
||||
import docspell.analysis.date._
|
||||
import docspell.common._
|
||||
import docspell.joex.process.ItemData.AttachmentDates
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.records.RAttachmentMeta
|
||||
import docspell.text.contact.Contact
|
||||
import docspell.text.date.DateFind
|
||||
import docspell.text.nlp.StanfordNerClassifier
|
||||
|
||||
object TextAnalysis {
|
||||
|
||||
|
@ -7,7 +7,7 @@ import docspell.common._
|
||||
import docspell.joex.scheduler.{Context, Task}
|
||||
import docspell.store.Store
|
||||
import docspell.store.records.{RAttachment, RAttachmentMeta}
|
||||
import docspell.text.ocr.{TextExtract, Config => OcrConfig}
|
||||
import docspell.extract.ocr.{TextExtract, Config => OcrConfig}
|
||||
|
||||
object TextExtraction {
|
||||
|
||||
|
Reference in New Issue
Block a user