Initial version.

Features:

- Upload PDF files and have them analyzed

- Manage metadata and items

- See processing in webapp
This commit is contained in:
Eike Kettner
2019-07-23 00:53:30 +02:00
parent 6154e6a387
commit 831cd8b655
341 changed files with 23634 additions and 484 deletions

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,14 @@
<configuration>
<!-- Logback setup: a single console appender, debug output for the
     "docspell" loggers, INFO for everything else. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<!-- Turns on Jansi support for the console appender, which the color
     conversion words in the pattern below rely on for ANSI output. -->
<withJansi>true</withJansi>
<encoder>
<!-- Layout: [thread] LEVEL (highlighted, left-aligned, min. width 5),
     logger name abbreviated to about 15 characters in cyan, then the
     message and a line break. -->
<pattern>[%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n</pattern>
</encoder>
</appender>
<!-- Loggers under the "docspell" name log at debug level. -->
<logger name="docspell" level="debug" />
<!-- Every other logger falls back to INFO and writes to STDOUT. -->
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

View File

@ -0,0 +1,94 @@
package docspell.text
import cats.effect.{Blocker, IO}
import docspell.common.LenientUri
import fs2.Stream
import scala.concurrent.ExecutionContext
/** Shared fixtures for the text tests: the two sample letters as byte streams
  * read from the test classpath, together with their text content.
  *
  * The text fixtures are kept verbatim, including their spelling oddities
  * (e.g. "Leat"/"Leaf", "Vorraus"): other specs in this module assert
  * character offsets into these strings, so changing a single character
  * shifts the expected positions.
  */
object TestFiles {

  /** Blocker backed by the global execution context; passed to `readURL`. */
  val blocker: Blocker = Blocker.liftExecutionContext(ExecutionContext.global)

  /** Context shift for `IO`, made available implicitly to users of this object.
    *
    * The type is spelled out explicitly: an implicit without a declared type
    * is not eligible for implicit search before its definition point, and
    * Scala 3 rejects it altogether.
    */
  implicit val CS: cats.effect.ContextShift[IO] = IO.contextShift(ExecutionContext.global)

  /** Streams the classpath resource at `path` in chunks of 16 KiB. */
  private def resourceBytes(path: String): Stream[IO, Byte] =
    LenientUri.fromJava(getClass.getResource(path)).readURL[IO](16 * 1024, blocker)

  /** Bytes of the German sample letter (`/letter-de-source.pdf`). */
  val letterSourceDE: Stream[IO, Byte] = resourceBytes("/letter-de-source.pdf")

  /** Bytes of the English sample letter (`/letter-en-source.pdf`). */
  val letterSourceEN: Stream[IO, Byte] = resourceBytes("/letter-en-source.pdf")

  /** Text of the German sample letter, with surrounding whitespace trimmed. */
  val letterDEText: String =
    """Max Mustermann
      |
      |Lilienweg 21
      |
      |12345 Nebendorf
      |
      |E-Mail: max.muster@gmail.com
      |
      |Max Mustermann, Lilienweg 21, 12345 Nebendorf
      |
      |EasyCare AG
      |Abteilung Buchhaltung
      |Ackerweg 12
      |
      |12346 Ulmen
      |
      |Nebendorf, 3. September 2019
      |Sehr geehrte Damen und Herren,
      |
      |hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum
      |nächstmöglichen Termin.
      |
      |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbe-
      |stätigung zu.
      |
      |Vielen Dank im Vorraus!
      |
      |Mit freundlichen Grüßen
      |
      |Max Mustermann
      |""".stripMargin.trim

  /** Text of the English sample letter, with surrounding whitespace trimmed. */
  val letterENText: String =
    """Derek Jeter
      |
      |123 Elm Ave.
      |
      |Treesville, ON MI1N 2P3
      |November 7, 2016
      |
      |Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016
      |
      |Mr. M. Leat
      |
      |Chief of Syrup Production
      |Old Sticky Pancake Company
      |456 Maple Lane
      |
      |Forest, ON 7TW8 9Y0
      |
      |Hemptown, September 3, 2019
      |Dear Mr. Leaf,
      |
      |Let me begin by thanking you for your past contributions to our Little League baseball
      |team. Your sponsorship aided in the purchase of ten full uniforms and several pieces of
      |baseball equipment for last years season.
      |
      |Next month, our company is planning an employee appreciation pancake breakfast hon-
      |oring retired employees for their past years of service and present employees for their
      |loyalty and dedication in spite of the current difficult economic conditions.
      |
      |We would like to place an order with your company for 25 pounds of pancake mix and
      |five gallons of maple syrup. We hope you will be able to provide these products in the
      |bulk quantities we require.
      |
      |As you are a committed corporate sponsor and long-time associate, we hope that you
      |will be able to join us for breakfast on December 12, 2016.
      |
      |Respectfully yours,
      |
      |Derek Jeter
      |""".stripMargin.trim
}

View File

@ -0,0 +1,32 @@
package docspell.text.contact
import docspell.common.{NerLabel, NerTag}
import minitest.SimpleTestSuite
/** Spec for `Contact.annotate`: an e-mail address and a website are expected
  * to be found in a paragraph of running text.
  */
object ContactAnnotateSpec extends SimpleTestSuite {

  test("find email") {
    val sample =
      """An email address such as John.Smith@example.com is made up
        |of a local-part, an @ symbol, then a case-insensitive domain.
        |Although the standard requires[1] the local part to be
        |case-sensitive, it also urges that receiving hosts deliver
        |messages in a case-independent fashion,[2] e.g., that the mail
        |system at example.com treat John.Smith as equivalent to
        |john.smith; some mail systems even treat them as equivalent
        |to johnsmith.[3] Mail systems often limit their users' choice
        |of name to a subset of the technically valid characters, and
        |in some cases also limit which addresses it is possible to
        |send mail to.""".stripMargin

    // Expected labels carry the lower-cased match and its offsets in `sample`.
    val emailLabel   = NerLabel("john.smith@example.com", NerTag.Email, 25, 47)
    val websiteLabel = NerLabel("example.com", NerTag.Website, 308, 319)

    val found = Contact.annotate(sample)
    assertEquals(found.size, 2)

    assertEquals(found(0), emailLabel)
    // Cross-check that the offsets really point at the address in the input.
    assertEquals(sample.substring(25, 47).toLowerCase, "john.smith@example.com")

    assertEquals(found(1), websiteLabel)
    assertEquals(sample.substring(308, 319).toLowerCase, "example.com")
  }
}

View File

@ -0,0 +1,14 @@
package docspell.text.date
import docspell.common.Language
import docspell.text.TestFiles
import minitest._
/** Smoke test for `DateFind.findDates`: the dates found in the English sample
  * letter are printed, nothing is asserted.
  */
object DateFindSpec extends SimpleTestSuite {

  test("find simple dates") {
    // German variant, currently disabled:
    //println(DateFind.findDates(TestFiles.letterDEText, Language.German).toVector)
    val englishDates = DateFind.findDates(TestFiles.letterENText, Language.English).toVector
    println(englishDates)
  }
}

View File

@ -0,0 +1,52 @@
package docspell.text.nlp
import docspell.common.{Language, NerLabel, NerTag}
import docspell.text.TestFiles
import minitest.SimpleTestSuite
/** Spec for `StanfordNerClassifier.nerAnnotate`: the labels produced for the
  * English and German sample letters are compared against fixed expectations.
  * The trailing two numbers of each label are offsets into the sample text.
  */
object TextAnalyserSuite extends SimpleTestSuite {

  test("find english ner labels") {
    val expected = Vector(
      NerLabel("Derek", NerTag.Person, 0, 5),
      NerLabel("Jeter", NerTag.Person, 6, 11),
      NerLabel("Treesville", NerTag.Person, 27, 37),
      NerLabel("Derek", NerTag.Person, 69, 74),
      NerLabel("Jeter", NerTag.Person, 75, 80),
      NerLabel("Treesville", NerTag.Location, 96, 106),
      NerLabel("M.", NerTag.Person, 142, 144),
      NerLabel("Leat", NerTag.Person, 145, 149),
      NerLabel("Syrup", NerTag.Organization, 160, 165),
      NerLabel("Production", NerTag.Organization, 166, 176),
      NerLabel("Old", NerTag.Organization, 177, 180),
      NerLabel("Sticky", NerTag.Organization, 181, 187),
      NerLabel("Pancake", NerTag.Organization, 188, 195),
      NerLabel("Company", NerTag.Organization, 196, 203),
      NerLabel("Maple", NerTag.Location, 208, 213),
      NerLabel("Lane", NerTag.Location, 214, 218),
      NerLabel("Forest", NerTag.Location, 220, 226),
      NerLabel("Hemptown", NerTag.Location, 241, 249),
      NerLabel("Little", NerTag.Organization, 349, 355),
      NerLabel("League", NerTag.Organization, 356, 362),
      NerLabel("Derek", NerTag.Person, 1119, 1124),
      NerLabel("Jeter", NerTag.Person, 1125, 1130)
    )

    val actual = StanfordNerClassifier.nerAnnotate(Language.English)(TestFiles.letterENText)
    assertEquals(actual, expected)
  }

  test("find german ner labels") {
    val expected = Vector(
      NerLabel("Max", NerTag.Person, 0, 3),
      NerLabel("Mustermann", NerTag.Person, 4, 14),
      NerLabel("Lilienweg", NerTag.Location, 16, 25),
      NerLabel("Max", NerTag.Person, 77, 80),
      NerLabel("Mustermann", NerTag.Person, 81, 91),
      NerLabel("Lilienweg", NerTag.Location, 93, 102),
      NerLabel("EasyCare", NerTag.Organization, 124, 132),
      NerLabel("AG", NerTag.Organization, 133, 135),
      NerLabel("Ackerweg", NerTag.Location, 158, 166),
      NerLabel("Nebendorf", NerTag.Location, 184, 193),
      NerLabel("Max", NerTag.Person, 505, 508),
      NerLabel("Mustermann", NerTag.Person, 509, 519)
    )

    val actual = StanfordNerClassifier.nerAnnotate(Language.German)(TestFiles.letterDEText)
    assertEquals(actual, expected)
  }
}

View File

@ -0,0 +1,25 @@
package docspell.text.ocr
import cats.effect.IO
import docspell.text.TestFiles
import minitest.SimpleTestSuite
/** Spec for `TextExtract.extract` on the sample PDF letters.
  *
  * Both tests call minitest's `ignore()` as their first statement, so they are
  * currently not executed past that point.
  */
object TextExtractionSuite extends SimpleTestSuite {
  import TestFiles._

  test("extract english pdf") {
    ignore()
    val extracted = TextExtract
      .extract[IO](letterSourceEN, blocker, "eng", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()
    println(extracted)
  }

  test("extract german pdf") {
    ignore()
    val expected = TestFiles.letterDEText
    val actual = TextExtract
      .extract[IO](letterSourceDE, blocker, "deu", Config.default)
      .compile
      .lastOrError
      .unsafeRunSync()
    // Compare without leading/trailing whitespace on either side.
    assertEquals(actual.trim, expected.trim)
  }
}

View File

@ -0,0 +1,24 @@
package docspell.text.split
import minitest._
/** Spec for `TextSplitter.splitToken`: a short German paragraph is split on
  * whitespace, then the token count and one token with its offsets are checked.
  */
object TestSplitterSpec extends SimpleTestSuite {

  test("simple splitting") {
    val input =
      """hiermit kündige ich meine Mitgliedschaft in der Kranken- und
        |Pflegeversicherung zum nächstmöglichen Termin.
        |
        |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen
        |eine Kündigungsbestätigung zu.
        |
        |Vielen Dank im Vorraus!""".stripMargin

    val tokens = TextSplitter.splitToken(input, " \t\r\n".toSet).toVector

    assertEquals(tokens.size, 31)
    // The expected word is lower-cased; its offsets point into `input`.
    assertEquals(tokens(13), Word("bitte", 109, 114))
    assertEquals(input.substring(109, 114).toLowerCase, "bitte")
  }
}