mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-08-05 02:24:52 +00:00
Initial version.
Features: - Upload PDF files and have them analyzed - Manage metadata and items - See processing in the webapp
This commit is contained in:
BIN
modules/text/src/test/resources/letter-de-source.pdf
Normal file
BIN
modules/text/src/test/resources/letter-de-source.pdf
Normal file
Binary file not shown.
BIN
modules/text/src/test/resources/letter-en-source.pdf
Normal file
BIN
modules/text/src/test/resources/letter-en-source.pdf
Normal file
Binary file not shown.
14
modules/text/src/test/resources/logback.xml
Normal file
14
modules/text/src/test/resources/logback.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<!-- Logback configuration placed in the text module's test resources. -->
<configuration>
|
||||
<!-- Console appender named STDOUT; withJansi is switched on and the pattern uses the %highlight and %cyan converters. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<withJansi>true</withJansi>
|
||||
|
||||
<encoder>
|
||||
<!-- Output layout: thread, level (padded to 5 chars), logger name abbreviated to 15 chars, message, newline. -->
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<!-- Loggers named "docspell" are set to debug; everything else falls under the INFO root below. -->
<logger name="docspell" level="debug" />
|
||||
<root level="INFO">
|
||||
<appender-ref ref="STDOUT" />
|
||||
</root>
|
||||
</configuration>
|
94
modules/text/src/test/scala/docspell/text/TestFiles.scala
Normal file
94
modules/text/src/test/scala/docspell/text/TestFiles.scala
Normal file
@ -0,0 +1,94 @@
|
||||
package docspell.text
|
||||
|
||||
import cats.effect.{Blocker, IO}
|
||||
import docspell.common.LenientUri
|
||||
import fs2.Stream
|
||||
|
||||
import scala.concurrent.ExecutionContext
|
||||
|
||||
/** Shared fixtures for the tests of the text module: byte streams of the
  * sample letter PDFs on the test classpath and the plain text that goes
  * with each letter.
  */
object TestFiles {

  /** Blocker built on the global execution context; handed to `readURL`. */
  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)

  /** Context shift on the global execution context.
    *
    * The type is spelled out so that implicit resolution does not depend
    * on type inference of the right-hand side.
    */
  implicit val CS: cats.effect.ContextShift[IO] =
    IO.contextShift(ExecutionContext.global)

  /** Bytes of the German sample letter (`/letter-de-source.pdf`). */
  val letterSourceDE: Stream[IO, Byte] =
    resourceBytes("/letter-de-source.pdf")

  /** Bytes of the English sample letter (`/letter-en-source.pdf`). */
  val letterSourceEN: Stream[IO, Byte] =
    resourceBytes("/letter-en-source.pdf")

  /** Text of the German letter, margin-stripped and trimmed.
    *
    * The wording (including "Vorraus") must stay exactly as it is: tests
    * compare against it and use character offsets into it.
    */
  val letterDEText = """Max Mustermann
    |
    |Lilienweg 21
    |
    |12345 Nebendorf
    |
    |E-Mail: max.muster@gmail.com
    |
    |Max Mustermann, Lilienweg 21, 12345 Nebendorf
    |
    |EasyCare AG
    |Abteilung Buchhaltung
    |Ackerweg 12
    |
    |12346 Ulmen
    |
    |Nebendorf, 3. September 2019
    |Sehr geehrte Damen und Herren,
    |
    |hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum
    |nächstmöglichen Termin.
    |
    |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbe-
    |stätigung zu.
    |
    |Vielen Dank im Vorraus!
    |
    |Mit freundlichen Grüßen
    |
    |Max Mustermann
    |""".stripMargin.trim

  /** Text of the English letter, margin-stripped and trimmed.
    *
    * Oddities such as "MI1N" and "Leat" are kept on purpose: tests use
    * character offsets into this exact string.
    */
  val letterENText = """Derek Jeter
    |
    |123 Elm Ave.
    |
    |Treesville, ON MI1N 2P3
    |November 7, 2016
    |
    |Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016
    |
    |Mr. M. Leat
    |
    |Chief of Syrup Production
    |Old Sticky Pancake Company
    |456 Maple Lane
    |
    |Forest, ON 7TW8 9Y0
    |
    |Hemptown, September 3, 2019
    |Dear Mr. Leaf,
    |
    |Let me begin by thanking you for your past contributions to our Little League baseball
    |team. Your sponsorship aided in the purchase of ten full uniforms and several pieces of
    |baseball equipment for last year’s season.
    |
    |Next month, our company is planning an employee appreciation pancake breakfast hon-
    |oring retired employees for their past years of service and present employees for their
    |loyalty and dedication in spite of the current difficult economic conditions.
    |
    |We would like to place an order with your company for 25 pounds of pancake mix and
    |five gallons of maple syrup. We hope you will be able to provide these products in the
    |bulk quantities we require.
    |
    |As you are a committed corporate sponsor and long-time associate, we hope that you
    |will be able to join us for breakfast on December 12, 2016.
    |
    |Respectfully yours,
    |
    |Derek Jeter
    |""".stripMargin.trim

  /** Opens the classpath resource at `path` as a byte stream read in
    * 16 KiB chunks.
    *
    * `getResource` returns `null` for a missing resource; that case is
    * turned into an error naming the path instead of a bare
    * `NullPointerException` further down.
    */
  private def resourceBytes(path: String): Stream[IO, Byte] = {
    val url = Option(getClass.getResource(path))
      .getOrElse(sys.error(s"Test resource not found on classpath: $path"))
    LenientUri.fromJava(url).readURL[IO](16 * 1024, blocker)
  }
}
|
@ -0,0 +1,32 @@
|
||||
package docspell.text.contact
|
||||
|
||||
import docspell.common.{NerLabel, NerTag}
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Exercises `Contact.annotate` on a paragraph that contains one e-mail
  * address and one bare domain name.
  */
object ContactAnnotateSpec extends SimpleTestSuite {

  test("find email") {
    val sample =
      """An email address such as John.Smith@example.com is made up
        |of a local-part, an @ symbol, then a case-insensitive domain.
        |Although the standard requires[1] the local part to be
        |case-sensitive, it also urges that receiving hosts deliver
        |messages in a case-independent fashion,[2] e.g., that the mail
        |system at example.com treat John.Smith as equivalent to
        |john.smith; some mail systems even treat them as equivalent
        |to johnsmith.[3] Mail systems often limit their users' choice
        |of name to a subset of the technically valid characters, and
        |in some cases also limit which addresses it is possible to
        |send mail to.""".stripMargin

    // Expected labels carry the lower-cased match plus its [start, end) offsets.
    val expectedEmail = NerLabel("john.smith@example.com", NerTag.Email, 25, 47)
    val expectedSite  = NerLabel("example.com", NerTag.Website, 308, 319)

    val found = Contact.annotate(sample)
    assertEquals(found.size, 2)

    assertEquals(found(0), expectedEmail)
    // Cross-check that the offsets really point at the address in the input.
    assertEquals(sample.substring(25, 47).toLowerCase, "john.smith@example.com")

    assertEquals(found(1), expectedSite)
    assertEquals(sample.substring(308, 319).toLowerCase, "example.com")
  }
}
|
@ -0,0 +1,14 @@
|
||||
package docspell.text.date
|
||||
|
||||
import docspell.common.Language
|
||||
import docspell.text.TestFiles
|
||||
import minitest._
|
||||
|
||||
/** Smoke test for `DateFind.findDates`: it prints what is found in the
  * English sample letter and makes no assertions.
  */
object DateFindSpec extends SimpleTestSuite {

  test("find simple dates") {
    // German variant, currently switched off:
    //println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
    val englishDates =
      DateFind.findDates(TestFiles.letterENText, Language.English).toVector
    println(englishDates)
  }
}
|
@ -0,0 +1,52 @@
|
||||
package docspell.text.nlp
|
||||
|
||||
import docspell.common.{Language, NerLabel, NerTag}
|
||||
import docspell.text.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Pins the labels that `StanfordNerClassifier.nerAnnotate` yields for the
  * English and German sample letters from `TestFiles`.
  */
object TextAnalyserSuite extends SimpleTestSuite {

  test("find english ner labels") {
    // Each label is (token, tag, start offset, end offset) into letterENText.
    // NOTE(review): the first "Treesville" is expected as Person, the second
    // as Location - presumably this records actual classifier output; confirm.
    val expected = Vector(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 69, 74),
      NerLabel("Jeter", NerTag.Person, 75, 80),
      NerLabel("Treesville", NerTag.Location, 96, 106),
      NerLabel("M.", NerTag.Person, 142, 144),
      NerLabel("Leat", NerTag.Person, 145, 149),
      NerLabel("Syrup", NerTag.Organization, 160, 165),
      NerLabel("Production", NerTag.Organization, 166, 176),
      NerLabel("Old", NerTag.Organization, 177, 180),
      NerLabel("Sticky", NerTag.Organization, 181, 187),
      NerLabel("Pancake", NerTag.Organization, 188, 195),
      NerLabel("Company", NerTag.Organization, 196, 203),
      NerLabel("Maple", NerTag.Location, 208, 213),
      NerLabel("Lane", NerTag.Location, 214, 218),
      NerLabel("Forest", NerTag.Location, 220, 226),
      NerLabel("Hemptown", NerTag.Location, 241, 249),
      NerLabel("Little", NerTag.Organization, 349, 355),
      NerLabel("League", NerTag.Organization, 356, 362),
      NerLabel("Derek", NerTag.Person, 1119, 1124),
      NerLabel("Jeter", NerTag.Person, 1125, 1130)
    )

    val actual =
      StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)

    assertEquals(actual, expected)
  }

  test("find german ner labels") {
    // Each label is (token, tag, start offset, end offset) into letterDEText.
    val expected = Vector(
      NerLabel("Max", NerTag.Person, 0, 3),
      NerLabel("Mustermann", NerTag.Person, 4, 14),
      NerLabel("Lilienweg", NerTag.Location, 16, 25),
      NerLabel("Max", NerTag.Person, 77, 80),
      NerLabel("Mustermann", NerTag.Person, 81, 91),
      NerLabel("Lilienweg", NerTag.Location, 93, 102),
      NerLabel("EasyCare", NerTag.Organization, 124, 132),
      NerLabel("AG", NerTag.Organization, 133, 135),
      NerLabel("Ackerweg", NerTag.Location, 158, 166),
      NerLabel("Nebendorf", NerTag.Location, 184, 193),
      NerLabel("Max", NerTag.Person, 505, 508),
      NerLabel("Mustermann", NerTag.Person, 509, 519)
    )

    val actual =
      StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)

    assertEquals(actual, expected)
  }
}
|
@ -0,0 +1,25 @@
|
||||
package docspell.text.ocr
|
||||
|
||||
import cats.effect.IO
|
||||
import docspell.text.TestFiles
|
||||
import minitest.SimpleTestSuite
|
||||
|
||||
/** Runs `TextExtract.extract` against the sample letter PDFs from
  * `TestFiles`. Both tests start with `ignore()`.
  */
object TextExtractionSuite extends SimpleTestSuite {
  import TestFiles._

  test("extract english pdf") {
    // NOTE(review): ignore() presumably marks the test as skipped, so the
    // statements below would not run - confirm against minitest.
    ignore()
    val extracted = TextExtract
      .extract[IO](letterSourceEN, blocker, "eng", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()
    println(extracted)
  }

  test("extract german pdf") {
    // NOTE(review): see the english test - ignore() presumably skips the rest.
    ignore()
    val actual = TextExtract
      .extract[IO](letterSourceDE, blocker, "deu", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()
    val expected = letterDEText

    // Both sides are trimmed so surrounding whitespace does not matter.
    assertEquals(actual.trim, expected.trim)
  }
}
|
@ -0,0 +1,24 @@
|
||||
package docspell.text.split
|
||||
|
||||
import minitest._
|
||||
|
||||
/** Checks `TextSplitter.splitToken` on a short German paragraph split at
  * whitespace.
  */
object TestSplitterSpec extends SimpleTestSuite {

  test("simple splitting") {
    val input =
      """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
        |Pflegeversicherung zum nächstmöglichen Termin.
        |
        |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
        |eine Kündigungsbestätigung zu.
        |
        |Vielen Dank im Vorraus!""".stripMargin

    // Space, tab, carriage return and newline act as separators.
    val separators = " \t\r\n".toSet
    val tokens = TextSplitter.splitToken(input, separators).toVector

    assertEquals(tokens.size, 31)
    // The 14th token is the lower-cased "Bitte" with its offsets in the input.
    assertEquals(tokens(13), Word("bitte", 109, 114))
    assertEquals(input.substring(109, 114).toLowerCase, "bitte")
  }

}
|
Reference in New Issue
Block a user