Reorganize processing code

Use separate modules for

- text extraction
- conversion to pdf
- text analysis
This commit is contained in:
Eike Kettner
2020-02-15 16:40:50 +01:00
parent 919381be1e
commit 851ee7ef0f
24 changed files with 103 additions and 60 deletions

View File

@ -0,0 +1,56 @@
package docspell.analysis.contact
import fs2.Stream
import cats.implicits._
import docspell.common._
import docspell.analysis.split._
object Contact {
  private[this] val protocols = Set("ftp", "http", "https")

  /** Scans `text` for e-mail addresses and websites, returning NER labels
    * carrying the token and its position in the input.
    */
  def annotate(text: String): Vector[NerLabel] =
    TextSplitter
      .splitToken[Nothing](text, " \t\r\n".toSet)
      .map { word =>
        if (isEmailAddress(word.value))
          Some(NerLabel(word.value, NerTag.Email, word.begin, word.end))
        else if (isWebsite(word.value))
          Some(NerLabel(word.value, NerTag.Website, word.begin, word.end))
        else None
      }
      .collect({ case Some(label) => label })
      .toVector

  /** A lenient e-mail check: exactly one `@`, a non-empty local part without
    * whitespace, and a known domain after the `@`.
    */
  def isEmailAddress(str: String): Boolean = {
    val at = str.indexOf('@')
    at > 0 &&
    str.indexOf('@', at + 1) < 0 &&
    str.substring(0, at).forall(c => !c.isWhitespace) &&
    Domain.isDomain(str.substring(at + 1))
  }

  /** True for parseable uris with a known scheme, or plain domain names. */
  def isWebsite(str: String): Boolean =
    LenientUri.parse(str).toOption match {
      case Some(uri) => protocols.contains(uri.scheme.head)
      case None      => Domain.isDomain(str)
    }

  /** Detects docspell's own "open upload" urls of the form
    * `…/open/upload/item/<id>` so they can be treated specially.
    */
  def isDocspellOpenUpload(str: String): Boolean = {
    def uploadPath(p: LenientUri.Path): Boolean =
      p match {
        case LenientUri.NonEmptyPath(segs) =>
          segs.init.takeRight(3) == List("open", "upload", "item") &&
            Ident.fromString(segs.last).isRight
        case _ => false
      }

    LenientUri
      .parse(str)
      .toOption
      .exists(uri => protocols.contains(uri.scheme.head) && uploadPath(uri.path))
  }
}

View File

@ -0,0 +1,44 @@
package docspell.analysis.contact
import cats.data.NonEmptyList
import docspell.common.LenientUri
/** A dotted domain name split into its labels plus the tld.
  * The `tld` value carries its leading dot (e.g. `.com`).
  */
case class Domain(labels: NonEmptyList[String], tld: String) {

  /** Renders the full domain, e.g. `mail.example` + `.com` = `mail.example.com`. */
  def asString: String =
    labels.toList.mkString("", ".", tld)

  /** Drops all but the last label, e.g. `mail.example.com` -> `example.com`. */
  def toPrimaryDomain: Domain =
    if (labels.tail.isEmpty) this
    else copy(labels = NonEmptyList.one(labels.last))
}
object Domain {

  /** Extracts the domain from an uri string; a scheme is prepended when
    * missing so that `LenientUri` can find the authority part.
    */
  def domainFromUri(uri: String): Either[String, Domain] =
    LenientUri
      .parse(if (uri.contains("://")) uri else s"http://$uri")
      // fixed typo in the error message ("authoriry")
      .flatMap(uri => uri.authority.toRight("Uri has no authority part"))
      .flatMap(auth => parse(auth))

  /** Parses `str` as a domain: it must end in a known tld and every label
    * may only contain letters, digits and hyphens.
    */
  def parse(str: String): Either[String, Domain] =
    Tld
      .findTld(str)
      .map(tld => (str.dropRight(tld.length), tld))
      .map({
        case (names, tld) =>
          names.split('.').toList match {
            case Nil => Left(s"Not a domain: $str")
            case segs
                if segs.forall(label =>
                  label.trim.nonEmpty && label.forall(c => c.isLetter || c.isDigit || c == '-')
                ) =>
              // segs is non-empty here, so fromListUnsafe cannot throw
              Right(Domain(NonEmptyList.fromListUnsafe(segs), tld))
            case _ => Left(s"Not a domain: $str")
          }
      })
      // consistent message format (was missing the colon)
      .getOrElse(Left(s"Not a domain: $str"))

  def isDomain(str: String): Boolean =
    parse(str).isRight
}

View File

@ -0,0 +1,83 @@
package docspell.analysis.contact
private[analysis] object Tld {

  /** Returns the first known tld (including the leading dot) that `str`
    * ends with, if any.
    */
  def findTld(str: String): Option[String] =
    known.find(tld => str.endsWith(tld))

  def endsWithTld(str: String): Boolean =
    findTld(str).nonEmpty

  /** A hand-picked subset of tlds; generic tlds first, then country codes. */
  private[this] val known = List(
    ".com", ".org", ".net", ".int", ".edu", ".gov", ".mil",
    ".ad", ".ae", ".al", ".am", ".ar", ".as", ".at", ".au", ".ax",
    ".ba", ".bd", ".be", ".bg", ".br", ".by", ".bz",
    ".ca", ".cc", ".ch", ".cn", ".co", ".cu", ".cx", ".cy",
    ".de", ".dk", ".dj",
    ".ee", ".eu",
    ".fi", ".fr",
    ".gr",
    ".hk", ".hr", ".hu",
    ".ie", ".il", ".io", ".is", ".ir", ".it",
    ".jp",
    ".li", ".lt",
    ".mt",
    ".no", ".nz",
    ".pl", ".pt",
    ".ru", ".rs",
    ".se", ".si", ".sk",
    ".th",
    ".ua", ".uk", ".us",
    ".ws"
  )
}

View File

@ -0,0 +1,132 @@
package docspell.analysis.date
import java.time.LocalDate
import fs2.{Pure, Stream}
import docspell.common._
import docspell.analysis.split._
import scala.util.Try
object DateFind {

  /** Finds date occurrences in `text` by sliding a three-token window over
    * the input and trying to read the tokens as a date in an order that
    * depends on the language.
    */
  def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] =
    TextSplitter
      .splitToken(text, " \t.,\n\r/".toSet)
      .sliding(3)
      .filter(_.length == 3) // trailing windows may be shorter
      .map(q =>
        SimpleDate
          .fromParts(q.toList, lang)
          .map(sd =>
            NerDateLabel(
              sd.toLocalDate,
              // fix: the label's end must match the substring span (q(2).end);
              // it previously used q(1).end, cutting off the last token
              NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(2).end)
            )
          )
      )
      .collect({ case Some(d) => d })

  private case class SimpleDate(year: Int, month: Int, day: Int) {
    // two-digit years are interpreted as 19xx
    def toLocalDate: LocalDate =
      LocalDate.of(if (year < 100) 1900 + year else year, month, day)
  }

  private object SimpleDate {
    // year-month-day
    val p0 = (readYear >> readMonth >> readDay).map {
      case ((y, m), d) => SimpleDate(y, m, d)
    }
    // day-month-year
    val p1 = (readDay >> readMonth >> readYear).map {
      case ((d, m), y) => SimpleDate(y, m, d)
    }
    // month-day-year
    val p2 = (readMonth >> readDay >> readYear).map {
      case ((m, d), y) => SimpleDate(y, m, d)
    }

    // ymd ✔, ydm, dmy ✔, dym, myd, mdy ✔
    def fromParts(parts: List[Word], lang: Language): Option[SimpleDate] = {
      val p = lang match {
        case Language.English => p2.or(p0).or(p1)
        case Language.German  => p1.or(p0).or(p2)
      }
      p.read(parts).toOption
    }

    /** Accepts two-digit (>= 0) or four-digit (> 1000) year tokens. */
    def readYear: Reader[Int] =
      Reader.readFirst(w =>
        w.value.length match {
          case 2 => Try(w.value.toInt).filter(n => n >= 0).toOption
          case 4 => Try(w.value.toInt).filter(n => n > 1000).toOption
          case _ => None
        }
      )

    /** Matches a month name or number against the `months` table.
      * fix: indexWhere yields 0 for january, so the filter must be
      * `_ >= 0` (it was `_ > 0`, which silently rejected january).
      */
    def readMonth: Reader[Int] =
      Reader.readFirst(w => Some(months.indexWhere(_.contains(w.value))).filter(_ >= 0).map(_ + 1))

    def readDay: Reader[Int] =
      Reader.readFirst(w => Try(w.value.toInt).filter(n => n > 0 && n <= 31).toOption)

    /** Minimal parser combinator over a list of words. */
    case class Reader[A](read: List[Word] => Result[A]) {
      /** Sequences two readers, pairing their results. */
      def >>[B](next: Reader[B]): Reader[(A, B)] =
        Reader(read.andThen(_.next(next)))

      def map[B](f: A => B): Reader[B] =
        Reader(read.andThen(_.map(f)))

      /** Tries `this`; on failure tries `other` on the same input. */
      def or(other: Reader[A]): Reader[A] =
        Reader(words =>
          read(words) match {
            case Result.Failure           => other.read(words)
            case s @ Result.Success(_, _) => s
          }
        )
    }

    object Reader {
      def fail[A]: Reader[A] =
        Reader(_ => Result.Failure)

      /** Reads one word, succeeding when `f` yields a value for it. */
      def readFirst[A](f: Word => Option[A]): Reader[A] =
        Reader({
          case Nil     => Result.Failure
          case a :: as => f(a).map(value => Result.Success(value, as)).getOrElse(Result.Failure)
        })
    }

    /** Parse result: success carries the value and the remaining words. */
    sealed trait Result[+A] {
      def toOption: Option[A]
      def map[B](f: A => B): Result[B]
      def next[B](r: Reader[B]): Result[(A, B)]
    }
    object Result {
      final case class Success[A](value: A, rest: List[Word]) extends Result[A] {
        val toOption = Some(value)
        def map[B](f: A => B): Result[B] = Success(f(value), rest)
        def next[B](r: Reader[B]): Result[(A, B)] =
          r.read(rest).map(b => (value, b))
      }
      final case object Failure extends Result[Nothing] {
        val toOption = None
        def map[B](f: Nothing => B): Result[B] = this
        def next[B](r: Reader[B]): Result[(Nothing, B)] = this
      }
    }

    // month synonyms (english/german); list position + 1 is the month number
    private val months = List(
      List("jan", "january", "januar", "01"),
      List("feb", "february", "februar", "02"),
      List("mar", "march", "märz", "marz", "03"),
      List("apr", "april", "04"),
      List("may", "mai", "05"),
      List("jun", "june", "juni", "06"),
      List("jul", "july", "juli", "07"),
      List("aug", "august", "08"),
      List("sep", "september", "09"),
      List("oct", "october", "oktober", "10"),
      List("nov", "november", "11"),
      List("dec", "december", "dezember", "12")
    )
  }
}

View File

@ -0,0 +1,63 @@
package docspell.analysis.nlp
import java.net.URL
import java.util.zip.GZIPInputStream
import edu.stanford.nlp.ie.AbstractSequenceClassifier
import edu.stanford.nlp.ie.crf.CRFClassifier
import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel}
import org.log4s.getLogger
import docspell.common._
import scala.util.Using
import scala.jdk.CollectionConverters._
object StanfordNerClassifier {
  private[this] val logger = getLogger

  // classifiers are expensive to load, so they are created once per language
  lazy val germanNerClassifier  = makeClassifier(Language.German)
  lazy val englishNerClassifier = makeClassifier(Language.English)

  /** Runs the Stanford NER classifier for `lang` over `text`, keeping only
    * tokens whose answer annotation maps to a known [[NerTag]].
    */
  def nerAnnotate(lang: Language)(text: String): Vector[NerLabel] = {
    val nerClassifier = lang match {
      case Language.English => englishNerClassifier
      case Language.German  => germanNerClassifier
    }
    val sentences = nerClassifier.classify(text).asScala
    sentences
      .flatMap(_.asScala)
      .collect(Function.unlift { label =>
        val tagValue = Option(label.get(classOf[CoreAnnotations.AnswerAnnotation])).getOrElse("")
        NerTag
          .fromString(tagValue)
          .toOption
          .map(t => NerLabel(label.word(), t, label.beginPosition(), label.endPosition()))
      })
      .toVector
  }

  /** Loads the gzipped CRF model for `lang` from the classpath; any load
    * failure is rethrown.
    */
  private def makeClassifier(lang: Language): AbstractSequenceClassifier[CoreLabel] = {
    logger.info(s"Creating ${lang.name} Stanford NLP NER classifier...")
    val url = classifierResource(lang)
    val loaded = Using(new GZIPInputStream(url.openStream())) { in =>
      CRFClassifier.getClassifier(in).asInstanceOf[AbstractSequenceClassifier[CoreLabel]]
    }
    loaded.fold(throw _, identity)
  }

  /** Resolves the model resource url for `lang`, failing when it is absent. */
  private def classifierResource(lang: Language): URL = {
    val url = lang match {
      case Language.German =>
        getClass.getResource(
          "/edu/stanford/nlp/models/ner/german.conll.germeval2014.hgc_175m_600.crf.ser.gz"
        )
      case Language.English =>
        getClass.getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz")
    }
    if (url == null) sys.error(s"NER model url not found for language ${lang.name}")
    else url
  }
}

View File

@ -0,0 +1,26 @@
package docspell.analysis.split
import fs2.Stream
/** Splits text into words.
*
*/
object TextSplitter {
  private[this] val trimChars =
    ".,…_[]^!<>=&ſ/{}*?()-:#$|~`+%\\\"'; \t\r\n".toSet

  /** Splits `str` at every character contained in `sep`, emitting the chunks
    * between separators as words with absolute positions (offset by `start`).
    * Consecutive separators produce no empty words.
    */
  def split[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] = {
    // position of the first separator character, -1 if none
    val idx = str.indexWhere(sep.contains)
    if (idx < 0) Stream.emit(Word(str, start, start + str.length))
    else if (idx == 0) split(str.substring(1), sep, start + 1)
    else
      Stream.emit(Word(str.substring(0, idx), start, start + idx)) ++
        Stream.suspend(split(str.substring(idx + 1), sep, start + idx + 1))
  }

  /** Like [[split]], but additionally trims punctuation from each word,
    * lowercases it and drops words that become empty.
    */
  def splitToken[F[_]](str: String, sep: Set[Char], start: Int = 0): Stream[F, Word] =
    split(str, sep, start).map(_.trim(trimChars)).filter(_.nonEmpty).map(_.toLower)
}

View File

@ -0,0 +1,32 @@
package docspell.analysis.split
/** A token of text together with its absolute [begin, end) position in the
  * original input.
  */
case class Word(value: String, begin: Int, end: Int) {
  def isEmpty: Boolean = value.isEmpty
  def nonEmpty: Boolean = !isEmpty
  def length: Int = value.length

  /** Drops leading characters contained in `chars`, advancing `begin`
    * by the number of dropped characters.
    */
  def trimLeft(chars: Set[Char]): Word = {
    val v = value.dropWhile(chars.contains)
    if (v == value) this
    else Word(v, begin + length - v.length, end)
  }

  /** Drops trailing characters contained in `chars`, shrinking `end`
    * accordingly.
    */
  def trimRight(chars: Set[Char]): Word = {
    // index of the last character that is kept, or -1 if all are trimmed
    @annotation.tailrec
    def findIndex(n: Int = length - 1): Int =
      if (n < 0 || !chars.contains(value.charAt(n))) n
      else findIndex(n - 1)
    val index = findIndex()
    if (index == length - 1) this
    // NOTE(review): the fully-trimmed word keeps end = begin + 1 (not begin);
    // callers filter empty words, so this is preserved as-is
    else if (index < 0) Word("", begin, begin + 1)
    // fix: keeping index+1 characters means the new end is begin + index + 1;
    // the previous `end - index` only subtracted the trailing index, giving a
    // too-large end whenever more than `index` characters were trimmed
    else Word(value.substring(0, index + 1), begin, begin + index + 1)
  }

  def trim(chars: Set[Char]): Word =
    trimLeft(chars).trimRight(chars)

  /** Lowercases the value; positions are unchanged. */
  def toLower: Word =
    copy(value = value.toLowerCase)
}

View File

@ -0,0 +1,21 @@
package docspell.analysis
import cats.effect.{Blocker, IO}
import docspell.files._
import scala.concurrent.ExecutionContext
object TestFiles {
  // blocker backed by the global pool for the blocking file reads below
  val blocker = Blocker.liftExecutionContext(ExecutionContext.global)
  implicit val CS = IO.contextShift(ExecutionContext.global)

  /** Text of the german example letter; read once and memoized. */
  lazy val letterDEText =
    ExampleFiles.letter_de_txt.readText[IO](16 * 1024, blocker).unsafeRunSync

  /** Text of the english example letter; read once and memoized. */
  lazy val letterENText =
    ExampleFiles.letter_en_txt.readText[IO](16 * 1024, blocker).unsafeRunSync
}

View File

@ -0,0 +1,30 @@
package docspell.analysis.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite
object ContactAnnotateSpec extends SimpleTestSuite {

  test("find email") {
    val text =
      """An email address such as John.Smith@example.com is made up
        |of a local-part, an @ symbol, then a case-insensitive domain.
        |Although the standard requires[1] the local part to be
        |case-sensitive, it also urges that receiving hosts deliver
        |messages in a case-independent fashion,[2] e.g., that the mail
        |system at example.com treat John.Smith as equivalent to
        |john.smith; some mail systems even treat them as equivalent
        |to johnsmith.[3] Mail systems often limit their users' choice
        |of name to a subset of the technically valid characters, and
        |in some cases also limit which addresses it is possible to
        |send mail to.""".stripMargin

    // expect exactly the e-mail address and the bare domain to be found
    val email   = NerLabel("john.smith@example.com", NerTag.Email, 25, 47)
    val website = NerLabel("example.com", NerTag.Website, 308, 319)

    val labels = Contact.annotate(text)
    assertEquals(labels.size, 2)
    assertEquals(labels(0), email)
    assertEquals(text.substring(25, 47).toLowerCase, "john.smith@example.com")
    assertEquals(labels(1), website)
    assertEquals(text.substring(308, 319).toLowerCase, "example.com")
  }
}

View File

@ -0,0 +1,14 @@
package docspell.analysis.date
import docspell.analysis.TestFiles
import minitest.SimpleTestSuite
import docspell.common.Language
object DateFindSpec extends SimpleTestSuite {

  test("find simple dates") {
    // NOTE(review): this only prints the findings; no assertions yet
    //println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
    val found = DateFind.findDates(TestFiles.letterENText, Language.English).toVector
    println(found)
  }
}

View File

@ -0,0 +1,56 @@
package docspell.analysis.nlp
import minitest.SimpleTestSuite
import docspell.analysis.TestFiles
import docspell.common._
object TextAnalyserSuite extends SimpleTestSuite {

  /** Builds the expected label vector from (word, tag, begin, end) tuples. */
  private def labels(ls: (String, NerTag, Int, Int)*): Vector[NerLabel] =
    ls.map({ case (w, t, b, e) => NerLabel(w, t, b, e) }).toVector

  test("find english ner labels") {
    val found = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
    val expect = labels(
      ("Derek", NerTag.Person, 0, 5),
      ("Jeter", NerTag.Person, 6, 11),
      ("Treesville", NerTag.Person, 27, 37),
      ("Derek", NerTag.Person, 69, 74),
      ("Jeter", NerTag.Person, 75, 80),
      ("Treesville", NerTag.Location, 96, 106),
      ("M.", NerTag.Person, 142, 144),
      ("Leat", NerTag.Person, 145, 149),
      ("Syrup", NerTag.Organization, 160, 165),
      ("Production", NerTag.Organization, 166, 176),
      ("Old", NerTag.Organization, 177, 180),
      ("Sticky", NerTag.Organization, 181, 187),
      ("Pancake", NerTag.Organization, 188, 195),
      ("Company", NerTag.Organization, 196, 203),
      ("Maple", NerTag.Location, 208, 213),
      ("Lane", NerTag.Location, 214, 218),
      ("Forest", NerTag.Location, 220, 226),
      ("Hemptown", NerTag.Location, 241, 249),
      ("Little", NerTag.Organization, 349, 355),
      ("League", NerTag.Organization, 356, 362),
      ("Derek", NerTag.Person, 1119, 1124),
      ("Jeter", NerTag.Person, 1125, 1130)
    )
    assertEquals(found, expect)
  }

  test("find german ner labels") {
    val found = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
    val expect = labels(
      ("Max", NerTag.Person, 0, 3),
      ("Mustermann", NerTag.Person, 4, 14),
      ("Lilienweg", NerTag.Location, 16, 25),
      ("Max", NerTag.Person, 77, 80),
      ("Mustermann", NerTag.Person, 81, 91),
      ("Lilienweg", NerTag.Location, 93, 102),
      ("EasyCare", NerTag.Organization, 124, 132),
      ("AG", NerTag.Organization, 133, 135),
      ("Ackerweg", NerTag.Location, 158, 166),
      ("Nebendorf", NerTag.Location, 184, 193),
      ("Max", NerTag.Person, 505, 508),
      ("Mustermann", NerTag.Person, 509, 519)
    )
    assertEquals(found, expect)
  }
}

View File

@ -0,0 +1,23 @@
package docspell.analysis.split
import minitest.SimpleTestSuite
object TestSplitterSpec extends SimpleTestSuite {

  test("simple splitting") {
    val text =
      """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
        |Pflegeversicherung zum nächstmöglichen Termin.
        |
        |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
        |eine Kündigungsbestätigung zu.
        |
        |Vielen Dank im Vorraus!""".stripMargin

    val separators = " \t\r\n".toSet
    val words = TextSplitter.splitToken(text, separators).toVector

    assertEquals(words.size, 31)
    // spot-check one token and that its position points back into the text
    assertEquals(words(13), Word("bitte", 109, 114))
    assertEquals(text.substring(109, 114).toLowerCase, "bitte")
  }
}
}