Reorganize processing code

Use separate modules for

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner
2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -0,0 +1,56 @@
package docspell.analysis.contact
import fs2.Stream
import cats.implicits._
import docspell.common._
import docspell.analysis.split._
object Contact {
  private[this] val protocols = Set("ftp", "http", "https")

  /** Scans `text` for e-mail addresses and websites, returning NER labels
    * carrying the token and its position in the input.
    */
  def annotate(text: String): Vector[NerLabel] =
    TextSplitter
      .splitToken[Nothing](text, " \t\r\n".toSet)
      .map { word =>
        if (isEmailAddress(word.value))
          Some(NerLabel(word.value, NerTag.Email, word.begin, word.end))
        else if (isWebsite(word.value))
          Some(NerLabel(word.value, NerTag.Website, word.begin, word.end))
        else None
      }
      .collect({ case Some(label) => label })
      .toVector

  /** A lenient e-mail check: exactly one `@`, a non-empty local part without
    * whitespace, and a known domain after the `@`.
    */
  def isEmailAddress(str: String): Boolean = {
    val at = str.indexOf('@')
    at > 0 &&
    str.indexOf('@', at + 1) < 0 &&
    str.substring(0, at).forall(c => !c.isWhitespace) &&
    Domain.isDomain(str.substring(at + 1))
  }

  /** True for parseable uris with a known scheme, or plain domain names. */
  def isWebsite(str: String): Boolean =
    LenientUri.parse(str).toOption match {
      case Some(uri) => protocols.contains(uri.scheme.head)
      case None      => Domain.isDomain(str)
    }

  /** Detects docspell's own "open upload" urls of the form
    * `…/open/upload/item/<id>` so they can be treated specially.
    */
  def isDocspellOpenUpload(str: String): Boolean = {
    def uploadPath(p: LenientUri.Path): Boolean =
      p match {
        case LenientUri.NonEmptyPath(segs) =>
          segs.init.takeRight(3) == List("open", "upload", "item") &&
            Ident.fromString(segs.last).isRight
        case _ => false
      }

    LenientUri
      .parse(str)
      .toOption
      .exists(uri => protocols.contains(uri.scheme.head) && uploadPath(uri.path))
  }
}

View File

@ -0,0 +1,44 @@
package docspell.analysis.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri
/** A dotted domain name split into its labels plus the tld.
  * The `tld` value carries its leading dot (e.g. `.com`).
  */
case class Domain(labels: NonEmptyList[String], tld: String) {

  /** Renders the full domain, e.g. `mail.example` + `.com` = `mail.example.com`. */
  def asString: String =
    labels.toList.mkString("", ".", tld)

  /** Drops all but the last label, e.g. `mail.example.com` -> `example.com`. */
  def toPrimaryDomain: Domain =
    if (labels.tail.isEmpty) this
    else copy(labels = NonEmptyList.one(labels.last))
}
object Domain {

  /** Extracts the domain from an uri string; a scheme is prepended when
    * missing so that `LenientUri` can find the authority part.
    */
  def domainFromUri(uri: String): Either[String, Domain] =
    LenientUri
      .parse(if (uri.contains("://")) uri else s"http://$uri")
      // fixed typo in the error message ("authoriry")
      .flatMap(uri => uri.authority.toRight("Uri has no authority part"))
      .flatMap(auth => parse(auth))

  /** Parses `str` as a domain: it must end in a known tld and every label
    * may only contain letters, digits and hyphens.
    */
  def parse(str: String): Either[String, Domain] =
    Tld
      .findTld(str)
      .map(tld => (str.dropRight(tld.length), tld))
      .map({
        case (names, tld) =>
          names.split('.').toList match {
            case Nil => Left(s"Not a domain: $str")
            case segs
                if segs.forall(label =>
                  label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')
                ) =>
              // segs is non-empty here, so fromListUnsafe cannot throw
              Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
            case _ => Left(s"Not a domain: $str")
          }
      })
      // consistent message format (was missing the colon)
      .getOrElse(Left(s"Not a domain: $str"))

  def isDomain(str: String): Boolean =
    parse(str).isRight
}

View File

@ -0,0 +1,83 @@
package docspell.analysis.contact
private[analysis] object Tld {

  /** Returns the first known tld (including the leading dot) that `str`
    * ends with, if any.
    */
  def findTld(str: String): Option[String] =
    known.find(tld => str.endsWith(tld))

  def endsWithTld(str: String): Boolean =
    findTld(str).nonEmpty

  /** A hand-picked subset of tlds; generic tlds first, then country codes. */
  private[this] val known = List(
    ".com", ".org", ".net", ".int", ".edu", ".gov", ".mil",
    ".ad", ".ae", ".al", ".am", ".ar", ".as", ".at", ".au", ".ax",
    ".ba", ".bd", ".be", ".bg", ".br", ".by", ".bz",
    ".ca", ".cc", ".ch", ".cn", ".co", ".cu", ".cx", ".cy",
    ".de", ".dk", ".dj",
    ".ee", ".eu",
    ".fi", ".fr",
    ".gr",
    ".hk", ".hr", ".hu",
    ".ie", ".il", ".io", ".is", ".ir", ".it",
    ".jp",
    ".li", ".lt",
    ".mt",
    ".no", ".nz",
    ".pl", ".pt",
    ".ru", ".rs",
    ".se", ".si", ".sk",
    ".th",
    ".ua", ".uk", ".us",
    ".ws"
  )
}

View File

@ -0,0 +1,132 @@
package docspell.analysis.date
import java.time.LocalDate
import fs2.{Pure, Stream}
import docspell.common._
import docspell.analysis.split._
import scala.util.Try
object DateFind {

  /** Finds date occurrences in `text` by sliding a three-token window over
    * the input and trying to read the tokens as a date in an order that
    * depends on the language.
    */
  def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
    TextSplitter
      .splitToken(text, " \t.,\n\r/".toSet)
      .sliding(3)
      .filter(_.length == 3) // trailing windows may be shorter
      .map(q =>
        SimpleDate
          .fromParts(q.toList, lang)
          .map(sd =>
            NerDateLabel(
              sd.toLocalDate,
              // fix: the label's end must match the substring span (q(2).end);
              // it previously used q(1).end, cutting off the last token
              NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(2).end)
            )
          )
      )
      .collect({ case Some(d) => d })

  private case class SimpleDate(year: Int, month: Int, day: Int) {
    // two-digit years are interpreted as 19xx
    def toLocalDate: LocalDate =
      LocalDate.of(if (year < 100) 1900 + year else year, month, day)
  }

  private object SimpleDate {
    // year-month-day
    val p0 = (readYear >> readMonth >> readDay).map {
      case ((y, m), d) => SimpleDate(y, m, d)
    }
    // day-month-year
    val p1 = (readDay >> readMonth >> readYear).map {
      case ((d, m), y) => SimpleDate(y, m, d)
    }
    // month-day-year
    val p2 = (readMonth >> readDay >> readYear).map {
      case ((m, d), y) => SimpleDate(y, m, d)
    }

    // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
    def fromParts(parts: List[Word], lang: Language): Option[SimpleDate] = {
      val p = lang match {
        case Language.English => p2.or(p0).or(p1)
        case Language.German  => p1.or(p0).or(p2)
      }
      p.read(parts).toOption
    }

    /** Accepts two-digit (>= 0) or four-digit (> 1000) year tokens. */
    def readYear: Reader[Int] =
      Reader.readFirst(w =>
        w.value.length match {
          case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
          case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
          case _ => None
        }
      )

    /** Matches a month name or number against the `months` table.
      * fix: indexWhere yields 0 for january, so the filter must be
      * `_ >= 0` (it was `_ > 0`, which silently rejected january).
      */
    def readMonth: Reader[Int] =
      Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1))

    def readDay: Reader[Int] =
      Reader.readFirst(w => Try(w.value.toInt).filter(n => n > 0 && n <= 31).toOption)

    /** Minimal parser combinator over a list of words. */
    case class Reader[A](read: List[Word] => Result[A]) {
      /** Sequences two readers, pairing their results. */
      def >>[B](next: Reader[B]): Reader[(A, B)] =
        Reader(read.andThen(_.next(next)))

      def map[B](f: A => B): Reader[B] =
        Reader(read.andThen(_.map(f)))

      /** Tries `this`; on failure tries `other` on the same input. */
      def or(other: Reader[A]): Reader[A] =
        Reader(words =>
          read(words) match {
            case Result.Failure           => other.read(words)
            case s @ Result.Success(_, _) => s
          }
        )
    }

    object Reader {
      def fail[A]: Reader[A] =
        Reader(_ => Result.Failure)

      /** Reads one word, succeeding when `f` yields a value for it. */
      def readFirst[A](f: Word => Option[A]): Reader[A] =
        Reader({
          case Nil     => Result.Failure
          case a :: as => f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
        })
    }

    /** Parse result: success carries the value and the remaining words. */
    sealed trait Result[+A] {
      def toOption: Option[A]
      def map[B](f: A => B): Result[B]
      def next[B](r: Reader[B]): Result[(A, B)]
    }
    object Result {
      final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
        val toOption = Some(value)
        def map[B](f: A => B): Result[B] = Success(f(value), rest)
        def next[B](r: Reader[B]): Result[(A, B)] =
          r.read(rest).map(b => (value, b))
      }
      final case object Failure extends Result[Nothing] {
        val toOption = None
        def map[B](f: Nothing => B): Result[B] = this
        def next[B](r: Reader[B]): Result[(Nothing, B)] = this
      }
    }

    // month synonyms (english/german); list position + 1 is the month number
    private val months = List(
      List("jan", "january", "januar", "01"),
      List("feb", "february", "februar", "02"),
      List("mar", "march", "märz", "marz", "03"),
      List("apr", "april", "04"),
      List("may", "mai", "05"),
      List("jun", "june", "juni", "06"),
      List("jul", "july", "juli", "07"),
      List("aug", "august", "08"),
      List("sep", "september", "09"),
      List("oct", "october", "oktober", "10"),
      List("nov", "november", "11"),
      List("dec", "december", "dezember", "12")
    )
  }
}

View File

@ -0,0 +1,63 @@
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import docspell.common._
import scala.util.Using
import scala.jdk.CollectionConverters._
object StanfordNerClassifier {
  private[this] val logger = getLogger

  // classifiers are expensive to load, so they are created once per language
  lazy val germanNerClassifier  = makeClassifier(Language.German)
  lazy val englishNerClassifier = makeClassifier(Language.English)

  /** Runs the Stanford NER classifier for `lang` over `text`, keeping only
    * tokens whose answer annotation maps to a known [[NerTag]].
    */
  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
    val nerClassifier = lang match {
      case Language.English => englishNerClassifier
      case Language.German  => germanNerClassifier
    }
    val sentences = nerClassifier.classify(text).asScala
    sentences
      .flatMap(_.asScala)
      .collect(Function.unlift { label =>
        val tagValue = Option(label.get(classOf[CoreAnnotations.AnswerAnnotation])).getOrElse("")
        NerTag
          .fromString(tagValue)
          .toOption
          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
      })
      .toVector
  }

  /** Loads the gzipped CRF model for `lang` from the classpath; any load
    * failure is rethrown.
    */
  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
    val url = classifierResource(lang)
    val loaded = Using(new GZIPInputStream(url.openStream())) { in =>
      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
    }
    loaded.fold(throw _, identity)
  }

  /** Resolves the model resource url for `lang`, failing when it is absent. */
  private def classifierResource(lang: Language): URL = {
    val url = lang match {
      case Language.German =>
        getClass.getResource(
          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
        )
      case Language.English =>
        getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
    }
    if (url == null) sys.error(s"NER model url not found for language ${lang.name}")
    else url
  }
}

View File

@ -0,0 +1,26 @@
package docspell.analysis.split
import fs2.Stream
/** Splits text into words.
*
*/
object TextSplitter {
  private[this] val trimChars =
    ".,…_[]^!<>=&ſ/{}*?()-:#$|~`+%\\\"'; \t\r\n".toSet

  /** Splits `str` at every character contained in `sep`, emitting the chunks
    * between separators as words with absolute positions (offset by `start`).
    * Consecutive separators produce no empty words.
    */
  def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
    // position of the first separator character, -1 if none
    val idx = str.indexWhere(sep.contains)
    if (idx < 0) Stream.emit(Word(str, start, start + str.length))
    else if (idx == 0) split(str.substring(1), sep, start + 1)
    else
      Stream.emit(Word(str.substring(0, idx), start, start + idx)) ++
        Stream.suspend(split(str.substring(idx + 1), sep, start + idx + 1))
  }

  /** Like [[split]], but additionally trims punctuation from each word,
    * lowercases it and drops words that become empty.
    */
  def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] =
    split(str, sep, start).map(_.trim(trimChars)).filter(_.nonEmpty).map(_.toLower)
}

View File

@ -0,0 +1,32 @@
package docspell.analysis.split
/** A token of text together with its absolute [begin, end) position in the
  * original input.
  */
case class Word(value: String, begin: Int, end: Int) {
  def isEmpty: Boolean = value.isEmpty
  def nonEmpty: Boolean = !isEmpty
  def length: Int = value.length

  /** Drops leading characters contained in `chars`, advancing `begin`
    * by the number of dropped characters.
    */
  def trimLeft(chars: Set[Char]): Word = {
    val v = value.dropWhile(chars.contains)
    if (v == value) this
    else Word(v, begin + length - v.length, end)
  }

  /** Drops trailing characters contained in `chars`, shrinking `end`
    * accordingly.
    */
  def trimRight(chars: Set[Char]): Word = {
    // index of the last character that is kept, or -1 if all are trimmed
    @annotation.tailrec
    def findIndex(n: Int = length - 1): Int =
      if (n < 0 || !chars.contains(value.charAt(n))) n
      else findIndex(n - 1)
    val index = findIndex()
    if (index == length - 1) this
    // NOTE(review): the fully-trimmed word keeps end = begin + 1 (not begin);
    // callers filter empty words, so this is preserved as-is
    else if (index < 0) Word("", begin, begin + 1)
    // fix: keeping index+1 characters means the new end is begin + index + 1;
    // the previous `end - index` only subtracted the trailing index, giving a
    // too-large end whenever more than `index` characters were trimmed
    else Word(value.substring(0, index + 1), begin, begin + index + 1)
  }

  def trim(chars: Set[Char]): Word =
    trimLeft(chars).trimRight(chars)

  /** Lowercases the value; positions are unchanged. */
  def toLower: Word =
    copy(value = value.toLowerCase)
}

View File

@ -0,0 +1,21 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
  // blocker backed by the global pool for the blocking file reads below
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  /** Text of the german example letter; read once and memoized. */
  lazy val letterDEText =
    ExampleFiles.letter_de_txt.readText[IO](16 * 1024, blocker).unsafeRunSync

  /** Text of the english example letter; read once and memoized. */
  lazy val letterENText =
    ExampleFiles.letter_en_txt.readText[IO](16 * 1024, blocker).unsafeRunSync
}

View File

@ -0,0 +1,30 @@
package docspell.analysis.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite
object ContactAnnotateSpec extends SimpleTestSuite {

  test("find email") {
    val text =
      """An email address such as John.Smith@example.com is made up
        |of a local-part, an @ symbol, then a case-insensitive domain.
        |Although the standard requires[1] the local part to be
        |case-sensitive, it also urges that receiving hosts deliver
        |messages in a case-independent fashion,[2] e.g., that the mail
        |system at example.com treat John.Smith as equivalent to
        |john.smith; some mail systems even treat them as equivalent
        |to johnsmith.[3] Mail systems often limit their users' choice
        |of name to a subset of the technically valid characters, and
        |in some cases also limit which addresses it is possible to
        |send mail to.""".stripMargin

    // expect exactly the e-mail address and the bare domain to be found
    val email   = NerLabel("john.smith@example.com", NerTag.Email, 25, 47)
    val website = NerLabel("example.com", NerTag.Website, 308, 319)

    val labels = Contact.annotate(text)
    assertEquals(labels.size, 2)
    assertEquals(labels(0), email)
    assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
    assertEquals(labels(1), website)
    assertEquals(text.substring(308, 319).toLowerCase, "example.com")
  }
}

View File

@ -0,0 +1,14 @@
package docspell.analysis.date
import docspell.analysis.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language
object DateFindSpec extends SimpleTestSuite {

  test("find simple dates") {
    // NOTE(review): this only prints the findings; no assertions yet
    //println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
    val found = DateFind.findDates(TestFiles.letterENText, Language.English).toVector
    println(found)
  }
}

View File

@ -0,0 +1,56 @@
package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.analysis.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {

  /** Builds the expected label vector from (word, tag, begin, end) tuples. */
  private def labels(ls: (String, NerTag, Int, Int)*): Vector[NerLabel] =
    ls.map({ case (w, t, b, e) => NerLabel(w, t, b, e) }).toVector

  test("find english ner labels") {
    val found = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
    val expect = labels(
      ("Derek", NerTag.Person, 0, 5),
      ("Jeter", NerTag.Person, 6, 11),
      ("Treesville", NerTag.Person, 27, 37),
      ("Derek", NerTag.Person, 69, 74),
      ("Jeter", NerTag.Person, 75, 80),
      ("Treesville", NerTag.Location, 96, 106),
      ("M.", NerTag.Person, 142, 144),
      ("Leat", NerTag.Person, 145, 149),
      ("Syrup", NerTag.Organization, 160, 165),
      ("Production", NerTag.Organization, 166, 176),
      ("Old", NerTag.Organization, 177, 180),
      ("Sticky", NerTag.Organization, 181, 187),
      ("Pancake", NerTag.Organization, 188, 195),
      ("Company", NerTag.Organization, 196, 203),
      ("Maple", NerTag.Location, 208, 213),
      ("Lane", NerTag.Location, 214, 218),
      ("Forest", NerTag.Location, 220, 226),
      ("Hemptown", NerTag.Location, 241, 249),
      ("Little", NerTag.Organization, 349, 355),
      ("League", NerTag.Organization, 356, 362),
      ("Derek", NerTag.Person, 1119, 1124),
      ("Jeter", NerTag.Person, 1125, 1130)
    )
    assertEquals(found, expect)
  }

  test("find german ner labels") {
    val found = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
    val expect = labels(
      ("Max", NerTag.Person, 0, 3),
      ("Mustermann", NerTag.Person, 4, 14),
      ("Lilienweg", NerTag.Location, 16, 25),
      ("Max", NerTag.Person, 77, 80),
      ("Mustermann", NerTag.Person, 81, 91),
      ("Lilienweg", NerTag.Location, 93, 102),
      ("EasyCare", NerTag.Organization, 124, 132),
      ("AG", NerTag.Organization, 133, 135),
      ("Ackerweg", NerTag.Location, 158, 166),
      ("Nebendorf", NerTag.Location, 184, 193),
      ("Max", NerTag.Person, 505, 508),
      ("Mustermann", NerTag.Person, 509, 519)
    )
    assertEquals(found, expect)
  }
}

View File

@ -0,0 +1,23 @@
package docspell.analysis.split
import minitest.SimpleTestSuite
object TestSplitterSpec extends SimpleTestSuite {

  test("simple splitting") {
    val text =
      """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
        |Pflegeversicherung zum nächstmöglichen Termin.
        |
        |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
        |eine Kündigungsbestätigung zu.
        |
        |Vielen Dank im Vorraus!""".stripMargin

    val separators = " \t\r\n".toSet
    val words = TextSplitter.splitToken(text, separators).toVector

    assertEquals(words.size, 31)
    // spot-check one token and that its position points back into the text
    assertEquals(words(13), Word("bitte", 109, 114))
    assertEquals(text.substring(109, 114).toLowerCase, "bitte")
  }
}
}