Initial version.

Features:

- Upload PDF files and have them analyzed

- Manage metadata and items

- See processing in the web app
Eike Kettner
2019-07-23 00:53:30 +02:00
parent 6154e6a387
commit 831cd8b655
341 changed files with 23634 additions and 484 deletions

View File

@@ -0,0 +1,51 @@
package docspell.text.contact
import fs2.Stream
import cats.implicits._
import docspell.common.{Ident, LenientUri, NerLabel, NerTag}
import docspell.text.split.TextSplitter
object Contact {
private[this] val protocols = Set("ftp", "http", "https")
def annotate(text: String): Vector[NerLabel] =
TextSplitter.splitToken[Nothing](text, " \t\r\n".toSet).
map({ token =>
if (isEmailAddress(token.value)) NerLabel(token.value, NerTag.Email, token.begin, token.end).some
else if (isWebsite(token.value)) NerLabel(token.value, NerTag.Website, token.begin, token.end).some
else None
}).
flatMap(_.map(Stream.emit).getOrElse(Stream.empty)).
toVector
def isEmailAddress(str: String): Boolean = {
val atIdx = str.indexOf('@')
if (atIdx <= 0 || str.indexOf('@', atIdx + 1) > 0) false
else {
val name = str.substring(0, atIdx)
val dom = str.substring(atIdx + 1)
Domain.isDomain(dom) && name.forall(c => !c.isWhitespace)
}
}
def isWebsite(str: String): Boolean =
LenientUri.parse(str).
toOption.
map(uri => protocols.contains(uri.scheme.head)).
getOrElse(Domain.isDomain(str))
def isDocspellOpenUpload(str: String): Boolean = {
def isUploadPath(p: LenientUri.Path): Boolean =
p match {
case LenientUri.RootPath => false
case LenientUri.EmptyPath => false
case LenientUri.NonEmptyPath(segs) =>
Ident.fromString(segs.last).isRight &&
segs.init.takeRight(3) == List("open", "upload", "item")
}
LenientUri.parse(str).
toOption.
exists(uri => protocols.contains(uri.scheme.head) && isUploadPath(uri.path))
}
}
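
For illustration, a small hypothetical usage sketch of the API above (the input values and the outcome of LenientUri.parse are assumptions, not part of this commit); note that annotate emits the lower-cased token values produced by TextSplitter.splitToken:

// Hypothetical usage sketch – input string and expected tags are made up.
val labels: Vector[NerLabel] =
  Contact.annotate("Write to info@example.com or see https://docspell.org")
// would yield an Email label for the address and a Website label for the URL

Contact.isEmailAddress("info@example.com")  // true
Contact.isWebsite("https://docspell.org")   // true – scheme is in the protocols set
Contact.isWebsite("docspell.org")           // true – falls back to Domain.isDomain
Contact.isDocspellOpenUpload("http://localhost:7880/api/v1/open/upload/item/abc123")
// true, provided the last segment parses as an Ident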

View File

@@ -0,0 +1,40 @@
package docspell.text.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri
case class Domain(labels: NonEmptyList[String], tld: String) {
def asString: String =
labels.toList.mkString(".") + tld
def toPrimaryDomain: Domain =
if (labels.tail.isEmpty) this
else Domain(NonEmptyList.of(labels.last), tld)
}
object Domain {
def domainFromUri(uri: String): Either[String, Domain] =
LenientUri.parse(if (uri.contains("://")) uri else s"http://$uri").
flatMap(uri => uri.authority.toRight("Uri has no authority part")).
flatMap(auth => parse(auth))
def parse(str: String): Either[String, Domain] = {
Tld.findTld(str).
map(tld => (str.dropRight(tld.length), tld)).
map({ case (names, tld) =>
names.split('.').toList match {
case Nil => Left(s"Not a domain: $str")
case segs if segs.forall(label =>
label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')) =>
Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
case _ => Left(s"Not a domain: $str")
}
}).
getOrElse(Left(s"Not a domain $str"))
}
def isDomain(str: String): Boolean =
parse(str).isRight
}
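
A hypothetical sketch of the parser's behaviour, based on the Tld list in the next file (the LenientUri result is an assumption):

// Hypothetical sketch – parsing splits the labels and keeps the matched TLD.
Domain.parse("mail.example.com")
// Right(Domain(NonEmptyList.of("mail", "example"), ".com")); asString == "mail.example.com"
Domain.parse("mail.example.com").map(_.toPrimaryDomain.asString)
// Right("example.com")
Domain.isDomain("no spaces allowed")  // false
Domain.domainFromUri("https://docs.example.org/path")
// Right(...), assuming LenientUri exposes "docs.example.org" as the authority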

View File

@@ -0,0 +1,83 @@
package docspell.text.contact
private[text] object Tld {
def findTld(str: String): Option[String] =
known.find(str.endsWith)
def endsWithTld(str: String): Boolean =
findTld(str).isDefined
/**
* Some selected TLDs.
*/
private[this] val known = List(
".com",
".org",
".net",
".int",
".edu",
".gov",
".mil",
".ad",
".ae",
".al",
".am",
".ar",
".as",
".at",
".au",
".ax",
".ba",
".bd",
".be",
".bg",
".br",
".by",
".bz",
".ca",
".cc",
".ch",
".cn",
".co",
".cu",
".cx",
".cy",
".de",
".dk",
".dj",
".ee",
".eu",
".fi",
".fr",
".gr",
".hk",
".hr",
".hu",
".ie",
".il",
".io",
".is",
".ir",
".it",
".jp",
".li",
".lt",
".mt",
".no",
".nz",
".pl",
".pt",
".ru",
".rs",
".se",
".si",
".sk",
".th",
".ua",
".uk",
".us",
".ws"
)
}
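
The lookup is a plain suffix match against this list, as this hypothetical sketch shows:

// Hypothetical sketch – findTld simply checks string suffixes.
Tld.findTld("docspell.org")     // Some(".org")
Tld.findTld("intranet.local")   // None – ".local" is not in the list above
Tld.endsWithTld("example.com")  // true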

View File

@@ -0,0 +1,125 @@
package docspell.text.date
import fs2._
import java.time.LocalDate
import docspell.common.{Language, NerDateLabel, NerLabel, NerTag}
import docspell.text.split.{TextSplitter, Word}
import scala.util.Try
object DateFind {
def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = {
TextSplitter.splitToken(text, " \t.,\n\r/".toSet).
sliding(3).
filter(_.length == 3).
map(q => SimpleDate.fromParts(q.toList, lang).
map(sd => NerDateLabel(sd.toLocalDate,
NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(2).end)))).
collect({ case Some(d) => d })
}
private case class SimpleDate(year: Int, month: Int, day: Int) {
def toLocalDate: LocalDate =
LocalDate.of(if (year < 100) 1900 + year else year, month, day)
}
private object SimpleDate {
val p0 = readYear >> readMonth >> readDay map {
case ((y, m), d) => SimpleDate(y, m, d)
}
val p1 = readDay >> readMonth >> readYear map {
case ((d, m), y) => SimpleDate(y, m, d)
}
val p2 = readMonth >> readDay >> readYear map {
case ((m, d), y) => SimpleDate(y, m, d)
}
// ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
def fromParts(parts: List[Word], lang: Language): Option[SimpleDate] = {
val p = lang match {
case Language.English => p2.or(p0).or(p1)
case Language.German => p1.or(p0).or(p2)
}
p.read(parts).toOption
}
def readYear: Reader[Int] = {
Reader.readFirst(w => w.value.length match {
case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
case _ => None
})
}
def readMonth: Reader[Int] =
Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1))
def readDay: Reader[Int] =
Reader.readFirst(w => Try(w.value.toInt).filter(n => n > 0 && n <= 31).toOption)
case class Reader[A](read: List[Word] => Result[A]) {
def >>[B](next: Reader[B]): Reader[(A, B)] =
Reader(read.andThen(_.next(next)))
def map[B](f: A => B): Reader[B] =
Reader(read.andThen(_.map(f)))
def or(other: Reader[A]): Reader[A] =
Reader(words => read(words) match {
case Result.Failure => other.read(words)
case s @ Result.Success(_, _) => s
})
}
object Reader {
def fail[A]: Reader[A] =
Reader(_ => Result.Failure)
def readFirst[A](f: Word => Option[A]): Reader[A] =
Reader({
case Nil => Result.Failure
case a :: as => f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
})
}
sealed trait Result[+A] {
def toOption: Option[A]
def map[B](f: A => B): Result[B]
def next[B](r: Reader[B]): Result[(A, B)]
}
object Result {
final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
val toOption = Some(value)
def map[B](f: A => B): Result[B] = Success(f(value), rest)
def next[B](r: Reader[B]): Result[(A, B)] =
r.read(rest).map(b => (value, b))
}
final case object Failure extends Result[Nothing] {
val toOption = None
def map[B](f: Nothing => B): Result[B] = this
def next[B](r: Reader[B]): Result[(Nothing, B)] = this
}
}
private val months = List(
List("jan", "january", "januar", "01"),
List("feb", "february", "februar", "02"),
List("mar", "march", "märz", "marz", "03"),
List("apr", "april", "04"),
List("may", "mai", "05"),
List("jun", "june", "juni", "06"),
List("jul", "july", "juli", "07"),
List("aug", "august", "08"),
List("sep", "september", "09"),
List("oct", "october", "oktober", "10"),
List("nov", "november", "11"),
List("dec", "december", "dezember", "12")
)
}
}
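
A hypothetical usage sketch (inputs made up) showing how a window of three adjacent tokens becomes a date, with the token order preferred by the language tried first:

// Hypothetical sketch – German prefers day/month/year, English month/day/year.
DateFind.findDates("Rechnung vom 03.09.2019, fällig sofort", Language.German).toVector
// would contain a NerDateLabel for LocalDate.of(2019, 9, 3)
DateFind.findDates("Invoice date: 11/07/2016", Language.English).toVector
// would contain a NerDateLabel for LocalDate.of(2016, 11, 7)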

View File

@@ -0,0 +1,56 @@
package docspell.text.nlp
import java.util.zip.GZIPInputStream
import docspell.common.{Language, NerLabel, NerTag}
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import scala.jdk.CollectionConverters._
import org.log4s._
import java.net.URL
import scala.util.Using
object StanfordNerClassifier {
private [this] val logger = getLogger
lazy val germanNerClassifier = makeClassifier(Language.German)
lazy val englishNerClassifier = makeClassifier(Language.English)
def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
val nerClassifier = lang match {
case Language.English => englishNerClassifier
case Language.German => germanNerClassifier
}
nerClassifier.classify(text).asScala.flatMap(a => a.asScala).
collect(Function.unlift(label => {
val tag = label.get(classOf[CoreAnnotations.AnswerAnnotation])
NerTag.fromString(Option(tag).getOrElse("")).toOption.
map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
})).
toVector
}
private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
val ner = classifierResource(lang)
Using(new GZIPInputStream(ner.openStream())) { in =>
CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
}.fold(throw _, identity)
}
private def classifierResource(lang: Language): URL = {
def check(u: URL): URL =
if (u == null) sys.error(s"NER model url not found for language ${lang.name}")
else u
check(lang match {
case Language.German =>
getClass.getResource("/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz")
case Language.English =>
getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
})
}
}
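
A hypothetical call sketch; the actual labels depend entirely on the bundled Stanford CRF models:

// Hypothetical sketch – results vary with the models on the classpath.
val labels: Vector[NerLabel] =
  StanfordNerClassifier.nerAnnotate(Language.German)("Max Mustermann wohnt in Nebendorf.")
// might yield NerLabel("Max", NerTag.Person, 0, 3), NerLabel("Nebendorf", NerTag.Location, 24, 33), …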

View File

@@ -0,0 +1,66 @@
package docspell.text.ocr
import java.nio.file.{Path, Paths}
import docspell.common._
case class Config(
allowedContentTypes: Set[MimeType]
, ghostscript: Config.Ghostscript
, pageRange: Config.PageRange
, unpaper: Config.Unpaper
, tesseract: Config.Tesseract
) {
def isAllowed(mt: MimeType): Boolean =
allowedContentTypes contains mt
}
object Config {
case class PageRange(begin: Int)
case class Command(program: String, args: Seq[String], timeout: Duration) {
def mapArgs(f: String => String): Command =
Command(program, args map f, timeout)
def toCmd: List[String] =
program :: args.toList
lazy val cmdString: String =
toCmd.mkString(" ")
}
case class Ghostscript(command: Command, workingDir: Path)
case class Tesseract(command: Command)
case class Unpaper(command: Command)
val default = Config(
allowedContentTypes = Set(
MimeType.pdf,
MimeType.png,
MimeType.jpeg,
MimeType.tiff
),
pageRange = PageRange(10),
ghostscript = Ghostscript(
Command("gs", Seq("-dNOPAUSE"
, "-dBATCH"
, "-dSAFER"
, "-sDEVICE=tiffscaled8"
, "-sOutputFile={{outfile}}"
, "{{infile}}"),
Duration.seconds(30)),
Paths.get(System.getProperty("java.io.tmpdir")).
resolve("docspell-extraction")),
unpaper = Unpaper(Command("unpaper"
, Seq("{{infile}}", "{{outfile}}")
, Duration.seconds(30))),
tesseract = Tesseract(
Command("tesseract", Seq("{{file}}"
, "stdout"
, "-l"
, "{{lang}}"),
Duration.minutes(1)))
)
}
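
The defaults can be adjusted with copy, and the {{...}} placeholders in the argument lists are substituted later by the Ocr module. A hypothetical sketch:

// Hypothetical sketch – restrict to 5 pages and expand the ghostscript placeholders.
val cfg = Config.default.copy(pageRange = Config.PageRange(5))
val gs = cfg.ghostscript.command.mapArgs(arg =>
  arg.replace("{{infile}}", "in.pdf").replace("{{outfile}}", "%d.tif"))
// gs.cmdString == "gs -dNOPAUSE -dBATCH -dSAFER -sDEVICE=tiffscaled8 -sOutputFile=%d.tif in.pdf"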

View File

@@ -0,0 +1,56 @@
package docspell.text.ocr
import cats.implicits._
import scala.jdk.CollectionConverters._
import java.io.IOException
import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
import java.util.concurrent.atomic.AtomicInteger
import cats.effect.Sync
import fs2.Stream
object File {
def mkDir[F[_]: Sync](dir: Path): F[Path] =
Sync[F].delay(Files.createDirectories(dir))
def mkTempDir[F[_]: Sync](parent: Path, prefix: String): F[Path] =
mkDir(parent).map(p => Files.createTempDirectory(p, prefix))
def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay {
val count = new AtomicInteger(0)
Files.walkFileTree(dir, new SimpleFileVisitor[Path]() {
override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = {
Files.deleteIfExists(file)
count.incrementAndGet()
FileVisitResult.CONTINUE
}
override def postVisitDirectory(dir: Path, e: IOException): FileVisitResult =
Option(e) match {
case Some(ex) => throw ex
case None =>
Files.deleteIfExists(dir)
FileVisitResult.CONTINUE
}
})
count.get
}
def deleteFile[F[_]: Sync](file: Path): F[Unit] =
Sync[F].delay(Files.deleteIfExists(file)).map(_ => ())
def delete[F[_]: Sync](path: Path): F[Int] =
if (Files.isDirectory(path)) deleteDirectory(path)
else deleteFile(path).map(_ => 1)
def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)
(f: Path => Stream[F, A]): Stream[F, A] =
Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f)
def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay {
val javaList = Files.list(dir).filter(p => pred(p)).collect(java.util.stream.Collectors.toList())
javaList.asScala.toList.sortBy(_.getFileName.toString)
}
}
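
A hypothetical sketch (assuming cats.effect.IO and java.nio.file.Path are imported) of the bracketed temp-dir helper; the directory is deleted once the inner stream terminates:

// Hypothetical sketch – work inside a temp dir that is cleaned up afterwards.
def listTifs(parent: Path): Stream[IO, List[Path]] =
  File.withTempDir[IO, List[Path]](parent, "example") { dir =>
    Stream.eval(File.listFiles[IO](_.toString.endsWith(".tif"), dir))
  }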

View File

@@ -0,0 +1,9 @@
package docspell.text.ocr
case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {
}
object MimeTypeHint {
val none = MimeTypeHint(None, None)
}

View File

@@ -0,0 +1,148 @@
package docspell.text.ocr
import java.nio.file.Path
import cats.effect.{Blocker, ContextShift, Sync}
import fs2.Stream
import org.log4s._
object Ocr {
private[this] val logger = getLogger
/** Extract the text of all pages in the given pdf file.
*/
def extractPdf[F[_]: Sync: ContextShift](pdf: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscript(pdf, config, wd, blocker).
flatMap({ tmpImg =>
runTesseractFile(tmpImg, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
}
/** Extract the text from the given image file
*/
def extractImage[F[_]: Sync: ContextShift](img: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
runTesseractStdin(img, blocker, lang, config)
def extractPdFFile[F[_]: Sync: ContextShift](pdf: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd =>
runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker).
flatMap({ tif =>
runTesseractFile(tif, blocker, lang, config)
}).
fold1(_ + "\n\n\n" + _)
}
def extractImageFile[F[_]: Sync: ContextShift](img: Path, blocker: Blocker, lang: String, config: Config): Stream[F, String] =
runTesseractFile(img, blocker, lang, config)
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscript[F[_]: Sync: ContextShift](
pdf: Stream[F, Byte]
, cfg: Config
, wd: Path
, blocker: Blocker): Stream[F, Path] = {
val xargs =
if (cfg.pageRange.begin > 0) s"-dLastPage=${cfg.pageRange.begin}" +: cfg.ghostscript.command.args
else cfg.ghostscript.command.args
val cmd = cfg.ghostscript.command.copy(args = xargs).mapArgs(replace(Map(
"{{infile}}" -> "-",
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf).
evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
}
/** Run ghostscript to extract all pdf pages into tiff files. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runGhostscriptFile[F[_]: Sync: ContextShift](
pdf: Path
, ghostscript: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val cmd = ghostscript.mapArgs(replace(Map(
"{{infile}}" -> pdf.toAbsolutePath.toString,
"{{outfile}}" -> "%d.tif"
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
evalMap({ _ =>
File.listFiles(pathEndsWith(".tif"), wd)
}).
flatMap(fs => Stream.emits(fs))
}
private def pathEndsWith(ext: String): Path => Boolean =
p => p.getFileName.toString.endsWith(ext)
/** Run unpaper to optimize the image for ocr. The
* files are stored to a temporary location on disk and returned.
*/
private[text] def runUnpaperFile[F[_]: Sync: ContextShift](img: Path
, unpaper: Config.Command
, wd: Path, blocker: Blocker): Stream[F, Path] = {
val targetFile = img.resolveSibling("u-"+ img.getFileName.toString).toAbsolutePath
val cmd = unpaper.mapArgs(replace(Map(
"{{infile}}" -> img.toAbsolutePath.toString,
"{{outfile}}" -> targetFile.toString
)))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).
map(_ => targetFile).
handleErrorWith(th => {
logger.warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.")
Stream.emit(img)
})
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractFile[F[_]: Sync: ContextShift](
img: Path
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
// tesseract cannot cope with absolute filenames
// so use the parent as working dir
runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).
flatMap(uimg => {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> uimg.getFileName.toString
, "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout)
})
}
/** Run tesseract on the given image file and return the extracted
* text.
*/
private[text] def runTesseractStdin[F[_]: Sync: ContextShift](
img: Stream[F, Byte]
, blocker: Blocker
, lang: String
, config: Config): Stream[F, String] = {
val cmd = config.tesseract.command.mapArgs(replace(Map(
"{{file}}" -> "stdin"
, "{{lang}}" -> fixLanguage(lang))))
SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout)
}
private def replace(repl: Map[String, String]): String => String =
s => repl.foldLeft(s) { case (res, (k, v)) =>
res.replace(k, v)
}
private def fixLanguage(lang: String): String =
lang match {
case "de" => "deu"
case "en" => "eng"
case l => l
}
}

View File

@@ -0,0 +1,72 @@
package docspell.text.ocr
import java.io.InputStream
import java.nio.file.Path
import java.util.concurrent.TimeUnit
import cats.implicits._
import cats.effect.{Blocker, ContextShift, Sync}
import fs2.{Stream, io, text}
import org.log4s.getLogger
import scala.jdk.CollectionConverters._
import docspell.common.syntax.all._
object SystemCommand {
private[this] val logger = getLogger
final case class Result(rc: Int, stdout: String, stderr: String)
def exec[F[_]: Sync: ContextShift]( cmd: Config.Command
, blocker: Blocker
, wd: Option[Path] = None
, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
startProcess(cmd, wd){ proc =>
Stream.eval {
for {
_ <- writeToProcess(stdin, proc, blocker)
term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS))
_ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}")
else logger.fwarn(s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!")
_ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(())
out <- if (term) inputStreamToString(proc.getInputStream, blocker) else Sync[F].pure("")
err <- if (term) inputStreamToString(proc.getErrorStream, blocker) else Sync[F].pure("")
} yield Result(proc.exitValue, out, err)
}
}
def execSuccess[F[_]: Sync: ContextShift](cmd: Config.Command, blocker: Blocker, wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty): Stream[F, Result] =
exec(cmd, blocker, wd, stdin).flatMap { r =>
if (r.rc != 0) Stream.raiseError[F](new Exception(s"Command `${cmd.cmdString}` returned non-zero exit code ${r.rc}. Stderr: ${r.stderr}"))
else Stream.emit(r)
}
private def startProcess[F[_]: Sync,A](cmd: Config.Command, wd: Option[Path])(f: Process => Stream[F,A]): Stream[F, A] = {
val log = logger.fdebug(s"Running external command: ${cmd.cmdString}")
val proc = log *> Sync[F].delay {
val pb = new ProcessBuilder(cmd.toCmd.asJava)
wd.map(_.toFile).foreach(pb.directory)
pb.start()
}
Stream.bracket(proc)(p => logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ =>
p.destroy()
}).flatMap(f)
}
private def inputStreamToString[F[_]: Sync: ContextShift](in: InputStream, blocker: Blocker): F[String] =
io.readInputStream(Sync[F].pure(in), 16 * 1024, blocker, closeAfterUse = false).
through(text.utf8Decode).
chunks.
map(_.toVector.mkString).
fold1(_ + _).
compile.last.
map(_.getOrElse(""))
private def writeToProcess[F[_]: Sync: ContextShift](data: Stream[F, Byte], proc: Process, blocker: Blocker): F[Unit] =
data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).
compile.drain
private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] =
Sync[F].delay(proc.destroyForcibly()).attempt *> {
Sync[F].raiseError(new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})"))
}
}
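
A hypothetical sketch of running an external program through this helper (assumes an implicit ContextShift[IO] and a Blocker named blocker in scope):

// Hypothetical sketch – non-zero exit codes fail the stream via execSuccess.
val echo = Config.Command("echo", Seq("hello"), Duration.seconds(5))
val out: Stream[IO, String] =
  SystemCommand.execSuccess[IO](echo, blocker).map(_.stdout)
// would emit "hello\n" on most systems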

View File

@@ -0,0 +1,30 @@
package docspell.text.ocr
import cats.effect.{Blocker, ContextShift, Sync}
import docspell.common.MimeType
import fs2.Stream
object TextExtract {
def extract[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
extractOCR(in, blocker, lang, config)
def extractOCR[F[_]: Sync: ContextShift](in: Stream[F, Byte], blocker: Blocker, lang: String, config: Config): Stream[F, String] =
Stream.eval(TikaMimetype.detect(in)).
flatMap({
case mt if !config.isAllowed(mt) =>
raiseError(s"File `$mt` not allowed")
case MimeType.pdf =>
Ocr.extractPdf(in, blocker, lang, config)
case mt if mt.primary == "image" =>
Ocr.extractImage(in, blocker, lang, config)
case mt =>
raiseError(s"File `$mt` not supported")
})
private def raiseError[F[_]: Sync](msg: String): Stream[F, Nothing] =
Stream.raiseError[F](new Exception(msg))
}
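
Putting the pieces together, a hypothetical sketch of extracting text from an uploaded byte stream (assumes a Blocker and an implicit ContextShift[IO]):

// Hypothetical sketch – detect the mime type, then run the OCR pipeline.
def ocrText(bytes: Stream[IO, Byte], blocker: Blocker)(implicit cs: ContextShift[IO]): IO[String] =
  TextExtract.extract[IO](bytes, blocker, "deu", Config.default).compile.lastOrError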

View File

@@ -0,0 +1,45 @@
package docspell.text.ocr
import cats.implicits._
import cats.effect.Sync
import docspell.common.MimeType
import fs2.Stream
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys}
import org.apache.tika.mime.MediaType
object TikaMimetype {
private val tika = new TikaConfig().getDetector
private def convert(mt: MediaType): MimeType =
Option(mt).map(_.toString).
map(MimeType.parse).
flatMap(_.toOption).
map(normalize).
getOrElse(MimeType.octetStream)
private def makeMetadata(hint: MimeTypeHint): Metadata = {
val md = new Metadata
hint.filename.
foreach(md.set(TikaMetadataKeys.RESOURCE_NAME_KEY, _))
hint.advertised.
foreach(md.set(HttpHeaders.CONTENT_TYPE, _))
md
}
private def normalize(in: MimeType): MimeType = in match {
case MimeType(_, sub) if sub contains "xhtml" =>
MimeType.html
case _ => in
}
private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = {
convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint)))
}
def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] =
data.take(1024).
compile.toVector.
map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none))
}
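
A hypothetical sketch of the detection entry point; only the first 1024 bytes are handed to Tika:

// Hypothetical sketch – somePdfBytes is a placeholder stream.
def detectType(somePdfBytes: Stream[IO, Byte]): IO[MimeType] =
  TikaMimetype.detect[IO](somePdfBytes)
// a PDF upload would typically come back as MimeType.pdf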

View File

@@ -0,0 +1,30 @@
package docspell.text.split
import fs2.Stream
/** Splits text into words.
*
*/
object TextSplitter {
private[this] val trimChars =
".,…_[]^!<>=&ſ/{}*?()-:#$|~`+%\\\"'; \t\r\n".toSet
def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
val indexes = sep.map(c => str.indexOf(c.toInt)).filter(_ >= 0)
val index = if (indexes.isEmpty) - 1 else indexes.min
if (index < 0) Stream.emit(Word(str, start, start + str.length))
else if (index == 0) split(str.substring(1), sep, start + 1)
else Stream.emit(Word(str.substring(0, index), start, start + index)) ++
Stream.suspend(split(str.substring(index + 1), sep, start + index + 1))
}
def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
split(str, sep, start).
map(w => w.trim(trimChars)).
filter(_.nonEmpty).
map(_.toLower)
}
}
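
A hypothetical sketch; splitToken trims punctuation, lower-cases each word and keeps offsets into the original string:

// Hypothetical sketch – offsets refer to positions in the input string.
TextSplitter.splitToken[Nothing]("Vielen Dank im Voraus", " \t\r\n".toSet).toVector
// Vector(Word("vielen", 0, 6), Word("dank", 7, 11), Word("im", 12, 14), Word("voraus", 15, 21))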

View File

@@ -0,0 +1,32 @@
package docspell.text.split
case class Word(value: String, begin: Int, end: Int) {
def isEmpty: Boolean = value.isEmpty
def nonEmpty: Boolean = !isEmpty
def length : Int = value.length
def trimLeft(chars: Set[Char]): Word = {
val v = value.dropWhile(chars.contains)
if (v == value) this
else Word(v, begin + length - v.length, end)
}
def trimRight(chars: Set[Char]): Word = {
@annotation.tailrec
def findIndex(n: Int = length - 1): Int =
if (n < 0 || !chars.contains(value.charAt(n))) n
else findIndex(n - 1)
val index = findIndex()
if (index == length - 1) this
else if (index < 0) Word("", begin, begin + 1)
else Word(value.substring(0, index + 1), begin, end - (length - 1 - index))
}
def trim(chars: Set[Char]): Word =
trimLeft(chars).trimRight(chars)
def toLower: Word =
copy(value = value.toLowerCase)
}
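
A hypothetical sketch of the offset bookkeeping when trimming:

// Hypothetical sketch – begin/end keep pointing into the original text.
Word("(hello)", 10, 17).trim(Set('(', ')'))
// Word("hello", 11, 16)
Word("MiXeD", 0, 5).toLower
// Word("mixed", 0, 5)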

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,14 @@
<configuration>
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<logger name="docspell" level="debug" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@@ -0,0 +1,94 @@
package docspell.text
import cats.effect.{Blocker, IO}
import docspell.common.LenientUri
import fs2.Stream
import scala.concurrent.ExecutionContext
object TestFiles {
val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
implicit val CS = IO.contextShift(ExecutionContext.global)
val letterSourceDE: Stream[IO, Byte] =
LenientUri.fromJava(getClass.getResource("/letter-de-source.pdf")).
readURL[IO](16 * 1024, blocker)
val letterSourceEN: Stream[IO, Byte] =
LenientUri.fromJava(getClass.getResource("/letter-en-source.pdf")).
readURL[IO](16 * 1024, blocker)
val letterDEText = """Max Mustermann
|
|Lilienweg 21
|
|12345 Nebendorf
|
|E-Mail: max.muster@gmail.com
|
|Max Mustermann, Lilienweg 21, 12345 Nebendorf
|
|EasyCare AG
|Abteilung Buchhaltung
|Ackerweg 12
|
|12346 Ulmen
|
|Nebendorf, 3. September 2019
|Sehr geehrte Damen und Herren,
|
|hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum
|nächstmöglichen Termin.
|
|Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbe-
|stätigung zu.
|
|Vielen Dank im Vorraus!
|
|Mit freundlichen Grüßen
|
|Max Mustermann
|""".stripMargin.trim
val letterENText = """Derek Jeter
|
|123 Elm Ave.
|
|Treesville, ON MI1N 2P3
|November 7, 2016
|
|Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016
|
|Mr. M. Leat
|
|Chief of Syrup Production
|Old Sticky Pancake Company
|456 Maple Lane
|
|Forest, ON 7TW8 9Y0
|
|Hemptown, September 3, 2019
|Dear Mr. Leaf,
|
|Let me begin by thanking you for your past contributions to our Little League baseball
|team. Your sponsorship aided in the purchase of ten full uniforms and several pieces of
|baseball equipment for last years season.
|
|Next month, our company is planning an employee appreciation pancake breakfast hon-
|oring retired employees for their past years of service and present employees for their
|loyalty and dedication in spite of the current difficult economic conditions.
|
|We would like to place an order with your company for 25 pounds of pancake mix and
|five gallons of maple syrup. We hope you will be able to provide these products in the
|bulk quantities we require.
|
|As you are a committed corporate sponsor and long-time associate, we hope that you
|will be able to join us for breakfast on December 12, 2016.
|
|Respectfully yours,
|
|Derek Jeter
|""".stripMargin.trim
}

View File

@@ -0,0 +1,32 @@
package docspell.text.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite
object ContactAnnotateSpec extends SimpleTestSuite {
test("find email") {
val text =
"""An email address such as John.Smith@example.com is made up
|of a local-part, an @ symbol, then a case-insensitive domain.
|Although the standard requires[1] the local part to be
|case-sensitive, it also urges that receiving hosts deliver
|messages in a case-independent fashion,[2] e.g., that the mail
|system at example.com treat John.Smith as equivalent to
|john.smith; some mail systems even treat them as equivalent
|to johnsmith.[3] Mail systems often limit their users' choice
|of name to a subset of the technically valid characters, and
|in some cases also limit which addresses it is possible to
|send mail to.""".stripMargin
val labels = Contact.annotate(text)
assertEquals(labels.size, 2)
assertEquals(labels(0),
NerLabel("john.smith@example.com", NerTag.Email, 25, 47))
assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
assertEquals(labels(1),
NerLabel("example.com", NerTag.Website, 308, 319))
assertEquals(text.substring(308, 319).toLowerCase, "example.com")
}
}

View File

@@ -0,0 +1,14 @@
package docspell.text.date
import docspell.common.Language
import docspell.text.TestFiles
import minitest._
object DateFindSpec extends SimpleTestSuite {
test("find simple dates") {
//println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
println(DateFind.findDates(TestFiles.letterENText, Language.English).toVector)
}
}

View File

@@ -0,0 +1,52 @@
package docspell.text.nlp
import docspell.common.{Language, NerLabel, NerTag}
import docspell.text.TestFiles
import minitest.SimpleTestSuite
object TextAnalyserSuite extends SimpleTestSuite {
test("find english ner labels") {
val labels = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
val expect = Vector(NerLabel("Derek",NerTag.Person,0,5)
, NerLabel("Jeter",NerTag.Person,6,11)
, NerLabel("Treesville",NerTag.Person,27,37)
, NerLabel("Derek",NerTag.Person,69,74)
, NerLabel("Jeter",NerTag.Person,75,80)
, NerLabel("Treesville",NerTag.Location,96,106)
, NerLabel("M.",NerTag.Person,142,144)
, NerLabel("Leat",NerTag.Person,145,149)
, NerLabel("Syrup",NerTag.Organization,160,165)
, NerLabel("Production",NerTag.Organization,166,176)
, NerLabel("Old",NerTag.Organization,177,180)
, NerLabel("Sticky",NerTag.Organization,181,187)
, NerLabel("Pancake",NerTag.Organization,188,195)
, NerLabel("Company",NerTag.Organization,196,203)
, NerLabel("Maple",NerTag.Location,208,213)
, NerLabel("Lane",NerTag.Location,214,218)
, NerLabel("Forest",NerTag.Location,220,226)
, NerLabel("Hemptown",NerTag.Location,241,249)
, NerLabel("Little",NerTag.Organization,349,355)
, NerLabel("League",NerTag.Organization,356,362)
, NerLabel("Derek",NerTag.Person,1119,1124)
, NerLabel("Jeter",NerTag.Person,1125,1130))
assertEquals(labels, expect)
}
test("find german ner labels") {
val labels = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
val expect = Vector(NerLabel("Max", NerTag.Person, 0, 3)
, NerLabel("Mustermann", NerTag.Person, 4, 14)
, NerLabel("Lilienweg", NerTag.Location, 16, 25)
, NerLabel("Max", NerTag.Person, 77, 80)
, NerLabel("Mustermann", NerTag.Person, 81, 91)
, NerLabel("Lilienweg", NerTag.Location, 93, 102)
, NerLabel("EasyCare", NerTag.Organization, 124, 132)
, NerLabel("AG", NerTag.Organization, 133, 135)
, NerLabel("Ackerweg", NerTag.Location, 158, 166)
, NerLabel("Nebendorf", NerTag.Location, 184, 193)
, NerLabel("Max", NerTag.Person, 505, 508)
, NerLabel("Mustermann", NerTag.Person, 509, 519))
assertEquals(labels, expect)
}
}

View File

@@ -0,0 +1,25 @@
package docspell.text.ocr
import cats.effect.IO
import docspell.text.TestFiles
import minitest.SimpleTestSuite
object TextExtractionSuite extends SimpleTestSuite {
import TestFiles._
test("extract english pdf") {
ignore()
val text = TextExtract.extract[IO](letterSourceEN, blocker, "eng", Config.default).
compile.lastOrError.unsafeRunSync()
println(text)
}
test("extract german pdf") {
ignore()
val expect = TestFiles.letterDEText
val extract = TextExtract.extract[IO](letterSourceDE, blocker, "deu", Config.default).
compile.lastOrError.unsafeRunSync()
assertEquals(extract.trim, expect.trim)
}
}

View File

@@ -0,0 +1,24 @@
package docspell.text.split
import minitest._
object TestSplitterSpec extends SimpleTestSuite {
test("simple splitting") {
val text = """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
|Pflegeversicherung zum nächstmöglichen Termin.
|
|Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
|eine Kündigungsbestätigung zu.
|
|Vielen Dank im Vorraus!""".stripMargin
val words = TextSplitter.splitToken(text, " \t\r\n".toSet).toVector
assertEquals(words.size, 31)
assertEquals(words(13), Word("bitte", 109, 114))
assertEquals(text.substring(109, 114).toLowerCase, "bitte")
}
}