Merge pull request #1190 from eikek/update-stanford-core-nlp

Update stanford core nlp
This commit is contained in:
mergify[bot] 2021-11-20 14:09:04 +00:00 committed by GitHub
commit aecc689240
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 178 additions and 43 deletions

View File

@ -45,15 +45,16 @@ object DateFind {
private[this] val jpnChars = private[this] val jpnChars =
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
val stext = val stext =
if (lang == Language.Japanese) { if (lang == Language.Japanese) {
text.map(c => if (jpnChars.contains(c)) c else ' ') text.map(c => if (jpnChars.contains(c)) c else ' ')
} else text } else text
TextSplitter TextSplitter
.splitToken(stext, " \t.,\n\r/年月日".toSet) .splitToken(stext, " -\t.,\n\r/年月日".toSet)
.filter(w => lang != Language.Latvian || w.value != "gada") .filter(w => lang != Language.Latvian || w.value != "gada")
.filter(w => lang != Language.Spanish || w.value != "de")
} }
case class SimpleDate(year: Int, month: Int, day: Int) { case class SimpleDate(year: Int, month: Int, day: Int) {
@ -91,6 +92,7 @@ object DateFind {
case Language.French => dmy.or(ymd).or(mdy) case Language.French => dmy.or(ymd).or(mdy)
case Language.Italian => dmy.or(ymd).or(mdy) case Language.Italian => dmy.or(ymd).or(mdy)
case Language.Spanish => dmy.or(ymd).or(mdy) case Language.Spanish => dmy.or(ymd).or(mdy)
case Language.Hungarian => ymd
case Language.Czech => dmy.or(ymd).or(mdy) case Language.Czech => dmy.or(ymd).or(mdy)
case Language.Danish => dmy.or(ymd).or(mdy) case Language.Danish => dmy.or(ymd).or(mdy)
case Language.Finnish => dmy.or(ymd).or(mdy) case Language.Finnish => dmy.or(ymd).or(mdy)

View File

@ -30,6 +30,8 @@ object MonthName {
italian italian
case Language.Spanish => case Language.Spanish =>
spanish spanish
case Language.Hungarian =>
hungarian
case Language.Swedish => case Language.Swedish =>
swedish swedish
case Language.Norwegian => case Language.Norwegian =>
@ -324,4 +326,19 @@ object MonthName {
List("11", "נובמבר"), List("11", "נובמבר"),
List("12", "דצמבר") List("12", "דצמבר")
) )
private val hungarian = List(
List("I", "jan", "január"),
List("II", "febr", "február"),
List("III", "márc", "március"),
List("IV", "ápr", "április"),
List("V", "máj", "május"),
List("VI", "jún", "június"),
List("VII", "júl", "július"),
List("VIII", "aug", "augusztus"),
List("IX", "szept", "szeptember"),
List("X", "okt", "október"),
List("XI", "nov", "november"),
List("XII", "dec", "december")
)
} }

View File

@ -29,7 +29,7 @@ object BasicCRFAnnotator {
private[this] val logger = getLogger private[this] val logger = getLogger
// assert correct resource names // assert correct resource names
List(Language.French, Language.German, Language.English).foreach(classifierResource) NLPLanguage.all.toList.foreach(classifierResource)
type Annotator = AbstractSequenceClassifier[CoreLabel] type Annotator = AbstractSequenceClassifier[CoreLabel]
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
case Language.English => case Language.English =>
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
case Language.Spanish =>
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
// case Language.Italian =>
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
// case Language.Hungarian =>
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
}) })
} }
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German) private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English) private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French) private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
def forLang(language: NLPLanguage): Annotator = def forLang(language: NLPLanguage): Annotator =
language match { language match {
case Language.French => frenchNerClassifier case Language.French => frenchNerClassifier
case Language.German => germanNerClassifier case Language.German => germanNerClassifier
case Language.English => englishNerClassifier case Language.English => englishNerClassifier
case Language.Spanish => spanishNerClassifier
} }
} }

View File

@ -37,6 +37,8 @@ object Properties {
Properties.nerEnglish(regexNerFile) Properties.nerEnglish(regexNerFile)
case Language.French => case Language.French =>
Properties.nerFrench(regexNerFile, highRecall) Properties.nerFrench(regexNerFile, highRecall)
case Language.Spanish =>
Properties.nerSpanish(regexNerFile, highRecall)
} }
case StanfordNerSettings.RegexOnly(path) => case StanfordNerSettings.RegexOnly(path) =>
Properties.regexNerOnly(path) Properties.regexNerOnly(path)
@ -88,6 +90,18 @@ object Properties {
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
Properties(
"annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner",
"tokenize.language" -> "es",
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
"ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
"ner.applyNumericClassifiers" -> "true",
"ner.useSUTime" -> "false",
"ner.language" -> "es"
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
def regexNerOnly(regexNerMappingFile: Path): JProps = def regexNerOnly(regexNerMappingFile: Path): JProps =
Properties( Properties(
"annotators" -> "tokenize,ssplit" "annotators" -> "tokenize,ssplit"

View File

@ -13,7 +13,7 @@ import docspell.files.TestFiles
import munit._ import munit._
class DateFindSpec extends FunSuite { class DateFindTest extends FunSuite {
test("find simple dates") { test("find simple dates") {
val expect = Vector( val expect = Vector(
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
) )
} }
test("find spanish dates") {
assertEquals(
DateFind
.findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
.toVector,
Vector(
NerDateLabel(
LocalDate.of(2011, 12, 15),
NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
)
)
)
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
assertEquals(
DateFind
.findDates("2021-11-19", Language.Spanish)
.toVector,
Vector(
NerDateLabel(
LocalDate.of(2021, 11, 19),
NerLabel("2021-11-19", NerTag.Date, 0, 10)
)
)
)
}
} }

View File

@ -30,7 +30,7 @@ object Language {
override val allowsNLP = true override val allowsNLP = true
} }
object NLPLanguage { object NLPLanguage {
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French) val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
} }
case object German extends NLPLanguage { case object German extends NLPLanguage {
@ -53,11 +53,16 @@ object Language {
val iso3 = "ita" val iso3 = "ita"
} }
case object Spanish extends Language { case object Spanish extends NLPLanguage {
val iso2 = "es" val iso2 = "es"
val iso3 = "spa" val iso3 = "spa"
} }
case object Hungarian extends Language {
val iso2 = "hu"
val iso3 = "hun"
}
case object Portuguese extends Language { case object Portuguese extends Language {
val iso2 = "pt" val iso2 = "pt"
val iso3 = "por" val iso3 = "por"
@ -125,6 +130,7 @@ object Language {
French, French,
Italian, Italian,
Spanish, Spanish,
Hungarian,
Dutch, Dutch,
Portuguese, Portuguese,
Czech, Czech,

View File

@ -127,7 +127,13 @@ object SolrSetup {
"Add hebrew content field", "Add hebrew content field",
addContentField(Language.Hebrew) addContentField(Language.Hebrew)
), ),
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field") SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
SolrMigration[F](
19,
"Add hungarian",
addContentField(Language.Hungarian)
),
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
) )
def addFolderField: F[Unit] = def addFolderField: F[Unit] =

View File

@ -18,6 +18,7 @@ import docspell.joex.Config
import docspell.joex.analysis.RegexNerFile import docspell.joex.analysis.RegexNerFile
import docspell.joex.scheduler.Context import docspell.joex.scheduler.Context
import docspell.joex.scheduler.Task import docspell.joex.scheduler.Task
import docspell.store.queries.QItem
import docspell.store.records.RAttachment import docspell.store.records.RAttachment
import docspell.store.records.RAttachmentSource import docspell.store.records.RAttachmentSource
import docspell.store.records.RCollective import docspell.store.records.RCollective
@ -131,10 +132,13 @@ object ReProcessItem {
def getLanguage[F[_]: Sync]: Task[F, Args, Language] = def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
Task { ctx => Task { ctx =>
(for { val lang1 = OptionT(
coll <- OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId))) ctx.store.transact(QItem.getItemLanguage(ctx.args.itemId)).map(_.headOption)
lang = coll.language )
} yield lang).getOrElse(Language.German) val lang2 = OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
.map(_.language)
lang1.orElse(lang2).getOrElse(Language.German)
} }
def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] = def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =

View File

@ -0,0 +1,21 @@
CREATE TEMPORARY TABLE "temp_file_ids" (
cid varchar(254) not null,
file_id varchar(254) not null
);
INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model";
INSERT INTO "job"
SELECT md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}',
'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
FROM "classifier_setting";
DELETE FROM "classifier_model";
DELETE FROM "filemeta"
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
DELETE FROM "filechunk"
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
DROP TABLE "temp_file_ids";

View File

@ -714,4 +714,13 @@ object QItem {
txt = texts.map(_._1).mkString(pageSep) txt = texts.map(_._1).mkString(pageSep)
} yield TextAndTag(itemId, txt, tag) } yield TextAndTag(itemId, txt, tag)
/** Gets the language of the first attachment of the given item. */
def getItemLanguage(itemId: Ident): ConnectionIO[List[Language]] =
Select(
select(m.language),
from(m)
.innerJoin(a, a.id === m.id)
.innerJoin(i, i.id === a.itemId),
i.id === itemId
).orderBy(a.position.asc).build.query[Language].to[List]
} }

View File

@ -31,6 +31,7 @@ type Language
| Latvian | Latvian
| Japanese | Japanese
| Hebrew | Hebrew
| Hungarian
fromString : String -> Maybe Language fromString : String -> Maybe Language
@ -86,6 +87,9 @@ fromString str =
else if str == "heb" || str == "he" || str == "hebrew" then else if str == "heb" || str == "he" || str == "hebrew" then
Just Hebrew Just Hebrew
else if str == "hun" || str == "hu" || str == "hungarian" then
Just Hungarian
else else
Nothing Nothing
@ -144,6 +148,9 @@ toIso3 lang =
Hebrew -> Hebrew ->
"heb" "heb"
Hungarian ->
"hun"
all : List Language all : List Language
all = all =
@ -164,4 +171,5 @@ all =
, Latvian , Latvian
, Japanese , Japanese
, Hebrew , Hebrew
, Hungarian
] ]

View File

@ -67,6 +67,9 @@ gb lang =
Hebrew -> Hebrew ->
"Hebrew" "Hebrew"
Hungarian ->
"Hungarian"
de : Language -> String de : Language -> String
de lang = de lang =
@ -121,3 +124,6 @@ de lang =
Hebrew -> Hebrew ->
"Hebräisch" "Hebräisch"
Hungarian ->
"Ungarisch"

View File

@ -914,7 +914,7 @@ in {
The full and basic variants rely on pre-built language models The full and basic variants rely on pre-built language models
that are available for only 3 languages at the moment: German, that are available for only 3 languages at the moment: German,
English and French. English, French and Spanish.
Memory usage varies greatly among the languages. German has Memory usage varies greatly among the languages. German has
quite large models, that require about 1G heap. So joex should quite large models, that require about 1G heap. So joex should

View File

@ -40,7 +40,7 @@ object Dependencies {
val ScalaJavaTimeVersion = "2.3.0" val ScalaJavaTimeVersion = "2.3.0"
val ScodecBitsVersion = "1.1.29" val ScodecBitsVersion = "1.1.29"
val Slf4jVersion = "1.7.32" val Slf4jVersion = "1.7.32"
val StanfordNlpVersion = "4.2.2" val StanfordNlpVersion = "4.3.2"
val TikaVersion = "2.1.0" val TikaVersion = "2.1.0"
val YamuscaVersion = "0.8.1" val YamuscaVersion = "0.8.1"
val SwaggerUIVersion = "4.1.0" val SwaggerUIVersion = "4.1.0"
@ -185,18 +185,16 @@ object Dependencies {
) )
) )
val stanfordNlpModels = Seq( val stanfordNlpModels = {
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
.classifier("models"), Seq(
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) artifact.classifier("models"),
.classifier("models-german"), artifact.classifier("models-german"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) artifact.classifier("models-french"),
.classifier("models-french"), artifact.classifier("models-english"),
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion) artifact.classifier("models-spanish")
.classifier( )
"models-english" }
)
)
val tika = Seq( val tika = Seq(
"org.apache.tika" % "tika-core" % TikaVersion "org.apache.tika" % "tika-core" % TikaVersion

View File

@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
} }
private val nerModels = List( private val nerModels = List(
"german.distsim.crf.ser.gz", // English
"english.conll.4class.distsim.crf.ser.gz", "english.conll.4class.distsim.crf.ser.gz",
"regexner_caseless.tab",
"regexner_cased.tab",
"english-left3words-distsim.tagger",
"english-left3words-distsim.tagger.props",
// German
"german.distsim.crf.ser.gz",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
// French
"french-wikiner-4class.crf.ser.gz", "french-wikiner-4class.crf.ser.gz",
"french-mwt-statistical.tsv", "french-mwt-statistical.tsv",
"french-mwt.tagger", "french-mwt.tagger",
"french-mwt.tsv", "french-mwt.tsv",
"german-mwt.tsv",
"german-ud.tagger",
"german-ud.tagger.props",
"french-ud.tagger", "french-ud.tagger",
"french-ud.tagger.props", "french-ud.tagger.props",
"english-left3words-distsim.tagger", // Spanish
"english-left3words-distsim.tagger.props" "spanish.ancora.distsim.s512.crf.ser.gz",
"spanish-mwt.tsv",
"spanish-ud.tagger",
"kbp_regexner_number_sp.tag",
"kbp_regexner_mapping_sp.tag"
) )
} }

View File

@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
which requires more memory for certain languages (with the advantage which requires more memory for certain languages (with the advantage
of better results). Other values are `basic`, `regexonly` and of better results). Other values are `basic`, `regexonly` and
`disabled`. The modes `full` and `basic` use pre-defined language `disabled`. The modes `full` and `basic` use pre-defined language
models for processing documents of languages German, English and French. models for processing documents of languages German, English, French and
These require some amount of memory (see below). Spanish. These require some amount of memory (see below).
The mode `basic` is like the "light" variant to `full`. It doesn't use The mode `basic` is like the "light" variant to `full`. It doesn't use
all NLP features, which makes memory consumption much lower, but comes all NLP features, which makes memory consumption much lower, but comes

View File

@ -8,10 +8,10 @@ mktoc = true
+++ +++
When uploading a file, it is only saved to the database together with When uploading a file, it is only saved to the database together with
the given meta information. The file is not visible in the ui yet. the given meta information as a "job". The file is not visible in the
Then joex takes the next such file (or files in case you uploaded ui yet. Then joex takes the next such job and starts processing it.
many) and starts processing it. When processing finished, the item and When processing finished, the item and its files will show up in the
its files will show up in the ui. ui.
If an error occurs during processing, the item will be created If an error occurs during processing, the item will be created
anyways, so you can see it. Depending on the error, some information anyways, so you can see it. Depending on the error, some information
@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
whole language. These are also provided by [Stanford whole language. These are also provided by [Stanford
NLP](https://nlp.stanford.edu/software/), but not for all languages. NLP](https://nlp.stanford.edu/software/), but not for all languages.
So whether this can be used depends on the document language. Models So whether this can be used depends on the document language. Models
exist for German, English and French currently. exist for German, English, French and Spanish currently.
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
run custom rules against a text. This can be used as a fallback for run custom rules against a text. This can be used as a fallback for

View File

@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the
language. Docspell uses the [Stanford NLP language. Docspell uses the [Stanford NLP
Library](https://nlp.stanford.edu/software/) for its great machine Library](https://nlp.stanford.edu/software/) for its great machine
learning algorithms. Some of them, like certain NLP features, are only learning algorithms. Some of them, like certain NLP features, are only
available for some languages namely German, English and French. The available for some languages namely German, English, French and
reason is that the required statistical models are not available for Spanish. The reason is that the required statistical models are not
other languages. However, docspell can still run other algorithms for available for other languages. However, docspell can still run other
the other languages, like classification and custom rules based on the algorithms for the other languages, like classification and custom
address book. rules based on the address book.
More information about file processing and text analysis can be found More information about file processing and text analysis can be found
[here](@/docs/joex/file-processing.md#text-analysis). [here](@/docs/joex/file-processing.md#text-analysis).