mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 18:38:26 +00:00
Reorganize processing code
Use separate modules for: text extraction, conversion to PDF, and text analysis.
This commit is contained in:
@ -0,0 +1,21 @@
|
||||
package docspell.analysis
|
||||
|
||||
import cats.effect.{Blocker, IO}
|
||||
import docspell.files._
|
||||
|
||||
import scala.concurrent.ExecutionContext
|
||||
|
||||
/** Shared fixtures for the analysis tests.
  *
  * Provides the effect infrastructure needed to read the example files and
  * exposes the text of the example letters, each read lazily on first access.
  */
object TestFiles {

  /** Blocker backed by the global execution context; handed to `readText`. */
  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)

  // Implicit definitions get an explicit type so that implicit resolution
  // does not depend on type inference (and stays valid under Scala 3).
  implicit val CS: cats.effect.ContextShift[IO] =
    IO.contextShift(ExecutionContext.global)

  /** Text of the German example letter.
    *
    * `16 * 1024` is passed as the first `readText` argument
    * (presumably a chunk/buffer size in bytes -- TODO confirm).
    */
  lazy val letterDEText =
    ExampleFiles.letter_de_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync() // side-effecting call: keep the parentheses

  /** Text of the English example letter; read the same way as the German one. */
  lazy val letterENText =
    ExampleFiles.letter_en_txt
      .readText[IO](16 * 1024, blocker)
      .unsafeRunSync() // side-effecting call: keep the parentheses
}
|
@ -0,0 +1,30 @@
|
||||
package docspell.analysis.contact
|
||||
|
||||
import docspell.common.{NerLabel, NerTag}
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Checks that `Contact.annotate` finds the e-mail address and the web site
  * mentioned in a short English paragraph, including their character offsets.
  */
object ContactAnnotateSpec extends SimpleTestSuite {

  test("find email") {

    val paragraph =
      """An email address such as John.Smith@example.com is made up
        |of a local-part, an @ symbol, then a case-insensitive domain.
        |Although the standard requires[1] the local part to be
        |case-sensitive, it also urges that receiving hosts deliver
        |messages in a case-independent fashion,[2] e.g., that the mail
        |system at example.com treat John.Smith as equivalent to
        |john.smith; some mail systems even treat them as equivalent
        |to johnsmith.[3] Mail systems often limit their users' choice
        |of name to a subset of the technically valid characters, and
        |in some cases also limit which addresses it is possible to
        |send mail to.""".stripMargin

    // Expected annotations; the labels are compared in lower case below.
    val expectedEmail   = NerLabel("john.smith@example.com", NerTag.Email, 25, 47)
    val expectedWebsite = NerLabel("example.com", NerTag.Website, 308, 319)

    val found = Contact.annotate(paragraph)
    assertEquals(found.size, 2)

    // First hit: the e-mail address; the offsets must point at it in the input.
    assertEquals(found(0), expectedEmail)
    assertEquals(paragraph.substring(25, 47).toLowerCase, "john.smith@example.com")

    // Second hit: the bare domain mentioned later in the paragraph.
    assertEquals(found(1), expectedWebsite)
    assertEquals(paragraph.substring(308, 319).toLowerCase, "example.com")
  }
}
|
@ -0,0 +1,14 @@
|
||||
package docspell.analysis.date
|
||||
|
||||
import docspell.analysis.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.common.Language
|
||||
|
||||
/** Runs `DateFind.findDates` on the English example letter and prints the result. */
object DateFindSpec extends SimpleTestSuite {

  // NOTE(review): this test only prints what was found and asserts nothing,
  // so it can only fail if `findDates` (or loading the fixture) throws.
  test("find simple dates") {

    //println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
    val englishDates =
      DateFind.findDates(TestFiles.letterENText, Language.English).toVector
    println(englishDates)
  }
}
|
@ -0,0 +1,56 @@
|
||||
package docspell.analysis.nlp
|
||||
|
||||
import minitest.SimpleTestSuite
|
||||
import docspell.analysis.TestFiles
|
||||
import docspell.common._
|
||||
|
||||
/** Compares the labels produced by `StanfordNerClassifier.nerAnnotate` for the
  * English and German example letters against fixed expected vectors.
  */
object TextAnalyserSuite extends SimpleTestSuite {

  // Small builders so the expectation tables below read as (text, begin, end).
  private def person(text: String, begin: Int, end: Int): NerLabel =
    NerLabel(text, NerTag.Person, begin, end)

  private def location(text: String, begin: Int, end: Int): NerLabel =
    NerLabel(text, NerTag.Location, begin, end)

  private def organization(text: String, begin: Int, end: Int): NerLabel =
    NerLabel(text, NerTag.Organization, begin, end)

  test("find english ner labels") {
    val expected = Vector(
      person("Derek", 0, 5),
      person("Jeter", 6, 11),
      // The first "Treesville" occurrence is expected with the Person tag.
      person("Treesville", 27, 37),
      person("Derek", 69, 74),
      person("Jeter", 75, 80),
      location("Treesville", 96, 106),
      person("M.", 142, 144),
      person("Leat", 145, 149),
      organization("Syrup", 160, 165),
      organization("Production", 166, 176),
      organization("Old", 177, 180),
      organization("Sticky", 181, 187),
      organization("Pancake", 188, 195),
      organization("Company", 196, 203),
      location("Maple", 208, 213),
      location("Lane", 214, 218),
      location("Forest", 220, 226),
      location("Hemptown", 241, 249),
      organization("Little", 349, 355),
      organization("League", 356, 362),
      person("Derek", 1119, 1124),
      person("Jeter", 1125, 1130)
    )

    val actual = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
    assertEquals(actual, expected)
  }

  test("find german ner labels") {
    val expected = Vector(
      person("Max", 0, 3),
      person("Mustermann", 4, 14),
      location("Lilienweg", 16, 25),
      person("Max", 77, 80),
      person("Mustermann", 81, 91),
      location("Lilienweg", 93, 102),
      organization("EasyCare", 124, 132),
      organization("AG", 133, 135),
      location("Ackerweg", 158, 166),
      location("Nebendorf", 184, 193),
      person("Max", 505, 508),
      person("Mustermann", 509, 519)
    )

    val actual = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
    assertEquals(actual, expected)
  }
}
|
@ -0,0 +1,23 @@
|
||||
package docspell.analysis.split
|
||||
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Checks `TextSplitter.splitToken` on a short German letter: the number of
  * tokens and the text and offsets of one specific token.
  */
object TestSplitterSpec extends SimpleTestSuite {

  test("simple splitting") {
    val letter = """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
                   |Pflegeversicherung zum nächstmöglichen Termin.
                   |
                   |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
                   |eine Kündigungsbestätigung zu.
                   |
                   |Vielen Dank im Vorraus!""".stripMargin

    // Characters at which the text is split.
    val separators = " \t\r\n".toSet

    val tokens = TextSplitter.splitToken(letter, separators).toVector
    assertEquals(tokens.size, 31)

    // Token 13 is expected as lower-case "bitte"; the input has "Bitte" at 109..114.
    assertEquals(tokens(13), Word("bitte", 109, 114))
    assertEquals(letter.substring(109, 114).toLowerCase, "bitte")
  }

}
|
Reference in New Issue
Block a user