From 501c6f2988c560424ccc4d19eaa828d86b8df6a1 Mon Sep 17 00:00:00 2001 From: eikek Date: Sat, 20 Nov 2021 14:31:39 +0100 Subject: [PATCH] Updating stanford corenlp to 4.3.2; adding more languages There are models for Spanish, that have been added now. Also the Hungarian language has been added to the list of supported languages (for tesseract mainly, no nlp models) --- .../docspell/analysis/date/DateFind.scala | 6 ++-- .../docspell/analysis/date/MonthName.scala | 17 +++++++++++ .../analysis/nlp/BasicCRFAnnotator.scala | 10 ++++++- .../docspell/analysis/nlp/Properties.scala | 14 +++++++++ .../analysis/src/test/resources/test.ser.gz | Bin 1682 -> 1665 bytes ...{DateFindSpec.scala => DateFindTest.scala} | 27 +++++++++++++++++- .../main/scala/docspell/common/Language.scala | 10 +++++-- .../scala/docspell/ftssolr/SolrSetup.scala | 8 +++++- .../docspell/joex/process/ReProcessItem.scala | 2 +- .../V1.29.0__reset_classifier_file.sql | 21 ++++++++++++++ modules/webapp/src/main/elm/Data/Language.elm | 8 ++++++ .../src/main/elm/Messages/Data/Language.elm | 6 ++++ nix/module-joex.nix | 2 +- project/Dependencies.scala | 24 +++++++--------- project/NerModelsPlugin.scala | 23 +++++++++++---- website/site/content/docs/configure/_index.md | 4 +-- .../site/content/docs/joex/file-processing.md | 10 +++---- website/site/content/docs/webapp/metadata.md | 10 +++---- 18 files changed, 162 insertions(+), 40 deletions(-) rename modules/analysis/src/test/scala/docspell/analysis/date/{DateFindSpec.scala => DateFindTest.scala} (86%) create mode 100644 modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index dd5f3baf..fa3c5d1c 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -45,15 +45,16 @@ object DateFind { private[this] val jpnChars = ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet - private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = { val stext = if (lang == Language.Japanese) { text.map(c => if (jpnChars.contains(c)) c else ' ') } else text TextSplitter - .splitToken(stext, " \t.,\n\r/年月日".toSet) + .splitToken(stext, " -\t.,\n\r/年月日".toSet) .filter(w => lang != Language.Latvian || w.value != "gada") + .filter(w => lang != Language.Spanish || w.value != "de") } case class SimpleDate(year: Int, month: Int, day: Int) { @@ -91,6 +92,7 @@ object DateFind { case Language.French => dmy.or(ymd).or(mdy) case Language.Italian => dmy.or(ymd).or(mdy) case Language.Spanish => dmy.or(ymd).or(mdy) + case Language.Hungarian => ymd case Language.Czech => dmy.or(ymd).or(mdy) case Language.Danish => dmy.or(ymd).or(mdy) case Language.Finnish => dmy.or(ymd).or(mdy) diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index a447eb0b..0679e1b3 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -30,6 +30,8 @@ object MonthName { italian case Language.Spanish => spanish + case Language.Hungarian => + hungarian case Language.Swedish => swedish case Language.Norwegian => @@ -324,4 +326,19 
@@ object MonthName { List("11", "נובמבר"), List("12", "דצמבר") ) + + private val hungarian = List( + List("I", "jan", "január"), + List("II", "febr", "február"), + List("III", "márc", "március"), + List("IV", "ápr", "április"), + List("V", "máj", "május"), + List("VI", "jún", "június"), + List("VII", "júl", "július"), + List("VIII", "aug", "augusztus"), + List("IX", "szept", "szeptember"), + List("X", "okt", "október"), + List("XI", "nov", "november"), + List("XII", "dec", "december") + ) } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala index 04dc33cc..ae580992 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/BasicCRFAnnotator.scala @@ -29,7 +29,7 @@ object BasicCRFAnnotator { private[this] val logger = getLogger // assert correct resource names - List(Language.French, Language.German, Language.English).foreach(classifierResource) + NLPLanguage.all.toList.foreach(classifierResource) type Annotator = AbstractSequenceClassifier[CoreLabel] @@ -70,6 +70,12 @@ object BasicCRFAnnotator { "/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz" case Language.English => "/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" + case Language.Spanish => + "/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz" + // case Language.Italian => + // "/edu/stanford/nlp/models/ner/italian.crf.ser.gz" + // case Language.Hungarian => + // "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz" }) } @@ -77,12 +83,14 @@ object BasicCRFAnnotator { private[this] lazy val germanNerClassifier = makeAnnotator(Language.German) private[this] lazy val englishNerClassifier = makeAnnotator(Language.English) private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French) + private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish) def forLang(language: NLPLanguage): Annotator = language match { case Language.French => frenchNerClassifier case Language.German => germanNerClassifier case Language.English => englishNerClassifier + case Language.Spanish => spanishNerClassifier } } diff --git a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala index f5e903bd..cae02474 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/Properties.scala @@ -37,6 +37,8 @@ object Properties { Properties.nerEnglish(regexNerFile) case Language.French => Properties.nerFrench(regexNerFile, highRecall) + case Language.Spanish => + Properties.nerSpanish(regexNerFile, highRecall) } case StanfordNerSettings.RegexOnly(path) => Properties.regexNerOnly(path) @@ -88,6 +90,18 @@ object Properties { "ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz" ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = + Properties( + "annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner", + "tokenize.language" -> "es", + "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv", + "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger", + "ner.model" -> 
"edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz", + "ner.applyNumericClassifiers" -> "true", + "ner.useSUTime" -> "false", + "ner.language" -> "es" + ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall) + def regexNerOnly(regexNerMappingFile: Path): JProps = Properties( "annotators" -> "tokenize,ssplit" diff --git a/modules/analysis/src/test/resources/test.ser.gz b/modules/analysis/src/test/resources/test.ser.gz index b6d0956ba0f2100bc670502e77717ac428688a9d..bf40994600b14353c66f3deb95c03fc1a69ccf28 100644 GIT binary patch literal 1665 zcmV-{27dV;iwFP!000000G(G|Y*a-QKD$s#|Npk-Cx9lRMhLqhDw;I0v|S5JE~U^) zb=L=`cW3Voy*u|ZGq=0NL__=!_~74z1`|z`U<5V37=1Cs7@qxmAtWj$K1eh?nD}5~ zJ#&Bd?zTmYH`(1g=jS_T&N*}D?1kT;MGDwKi(XnP!b^@Qrn&8=O`Ax`N{eZOaZ1E^ zuw+ztf4sDG`pw>I5?b=Gj&Vf=H=wgbiSh)^@gl7$*lHkPCIZYLV8&C-&iL$M1KMpe zM{R#~0-l6K0oqkrP+4~DqEwJ9WDC$*rOYZTT&^h;vNadb&)Zh`sbSOmi7fkI2dM@R!1rqpjkhthct21|;n%YWl1J7VmA+t@=3USMh zpbBnpV4Ds%Js~JpndwMkIPkDfOT}KHk(Smgm%bc2_VpS_PTBazqZ=>U*xa zBYVMrs_XHqSHIN|Z6HP)6@n~kziOwx+4{z1^41zirj=GIh!_u-TT8wpIxjt?`wy2m=jVLYzz25?P1nB>sd=GXcg3?%vJSb zunD))IXoy975&%Ve)rE)&p#OWwm@u8`_>!FO?s6%DxSOW`qqxuemq^bs&o-AILxHJ z;Y~rR^=qfL>gPVGms|AL#(>rBfA?v9Pv}GS(77|I56t|w2sR1J9u(7u?efKrrJ}X; za3rVc4-|}>H;Hk__9{F}6p0(f_6eI>vy!7=+x(7eTmtWlM2m2uZ5IRE~``op5&n05{~Dh*?>@Q zw|VGf@(3lizgdK7J?=?`(aS=My<1Q#3+w&g3aQm!83K0Mtin{V2`cl@XFJxkYcn;A zc^G#*t`Hh&xLqzWHfz=~Vh&ZxYybc_j;?hLLc2CmLB5{?mI}a(nu`X9RO? zoz^yFj4@8cBBlu6{ZNKk&%SS4ka(x(x9fJ=BIL)WrzZdQ)Kxbpm z)U>1l1C6v0N3(VqFlg4#WH7=kjtUDM?hl3seSc+wXrkeEQ|peGiSV~HUV!eHg4G%L zfcBbtlA-59pS2TNE)j|m+t26}QM&gj*wiqpU}K}TOvu84t-n#@h-SJWR?o7g5|X=o za%4Ruros*~YT}qc%tc=-p>VB|)l0|^JNWsryB2=^ z4@cCtu4nsC!eLv50$*v?M@zN;i(E|SlxN$sp1~U)-?|5A94B-9& L6a7%sCk+4qeN-(* literal 1682 zcmV;D25tEtiwFP!000000G(G|h#W-_uG{2t|NlSvAr}RsVnTKaY7`E<+}*m#hRNlU zI}_OzN#9KGZs%ruCfz-^dl8hNMkVn<(FY{~1q~4s6<H7Jqx~jUm_TsP5ECp<%mY0-@@PZ?(B)8q9X%i_~VKHeiPKlTd zmW>j~ACtzusF(Kywb(F|MfK26PrEQJ$b#ZqcfO5d#5J5nu`dQ=VdW%4d%n&~B4C zYWu6>@FXPfC`Lp4pU#@{f1~={yD??1>h%B_#_FQ$x zc7y*^TJft_zf}=!AVwP(f-Gvkswcl1dF={0w+5QBuvW4Y6fJAbwUO4ASS$84S@ZPk zJsYq5{lfX*63}EIz#=yl+w`ZD=wDA2fnRSlh1nH-;!N?;%ldeozq)-4H#V>CqJkqeXF*r6X2IzY`> zEs7%6uK$2_pu)+VP3yt#{qFsbFO{8lv_m?~O;Q(ZGx$I4Vb(fpSxYEr5!9s2RkdQU z2`?tIcu5k<(22CICb&Wk&aisKV7q`bP+E&%%r~I zjX|pQYp1sA=RTpAoAuX*fYt4P_i24k=tK3;`7?v>nz=0zYyy@&D5ekFl}jB~` z_oHyxE6;eY>j;&G;gEjB?`X5psn0=cf!UOkGR;D75h>$bR;UO($wGHgIG&4U146Cc z=Ae_wLzLM5W)Y_4m?srRFAdH1PC<)lX!U=~q*{Ar2-sz_GE>1OsLVm1?G$HRo2et1 zhe^ld3ZapP+vO5-Q_ev*E}5z$wAl{2XkyTF&{KPlqTGol{da@%B5n$o8oQ3viJmk_ zFuqJ;&=t7PIQ`QESveR9$c%PJmq2X z_Cbd%d4&R=cB(@-5U!$3>k7nhN`^tyTvp`hj7iXgAUmkjEgNE&ts~5oh>HQ*R3peE zOqHf*Cqf^n!0M65MI6~QUI&g%acQDryC>KFGOH_@$Y@5!)I?xCD2E&j z;rq0p1y-HG=(1`=*Rltv0vY z{gR<#G!I=t=hDTmO8)edHDIuT7z!&$-VAlN;-p!ba(x(x9fJ=AIE|;=zZdSwLuY-@ z)U<>F1NF2JN3*sWFl5%wWH7=kj*B8X+#d`N`u<7<(L}@Trq-P-5#euXG7sG`1*aNZ^T-^g5MB~4$VGHHKM<-b^d+b1E1aqr-LPY z^bJ1t%!wcNyf^s7^64QKiJsk0?OsJFlnZ6`5YmJnOVgVg0mg3BE8u$5u4~sfPwuEg zYH9>^!^auhY@#ax3A|ehdIG_J c3vL|H?ZIe-e@`Bh+e5s60M;VBa!(Ba0MQ0Ep#T5? 
diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala similarity index 86% rename from modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala index f63a90ef..70746b66 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindTest.scala @@ -13,7 +13,7 @@ import docspell.files.TestFiles import munit._ -class DateFindSpec extends FunSuite { +class DateFindTest extends FunSuite { test("find simple dates") { val expect = Vector( @@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite { ) } + test("find spanish dates") { + assertEquals( + DateFind + .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2011, 12, 15), + NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50) + ) + ) + ) + println(DateFind.splitWords("2021-11-19", Language.Spanish).toList) + assertEquals( + DateFind + .findDates("2021-11-19", Language.Spanish) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 11, 19), + NerLabel("2021-11-19", NerTag.Date, 0, 10) + ) + ) + ) + } } diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 35fde297..f8a3ff2b 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -30,7 +30,7 @@ object Language { override val allowsNLP = true } object NLPLanguage { - val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French) + val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish) } case object German extends NLPLanguage { @@ -53,11 +53,16 @@ object Language { val iso3 = "ita" } - case object Spanish extends Language { + case object Spanish extends NLPLanguage { val iso2 = "es" val iso3 = "spa" } + case object Hungarian extends Language { + val iso2 = "hu" + val iso3 = "hun" + } + case object Portuguese extends Language { val iso2 = "pt" val iso3 = "por" @@ -125,6 +130,7 @@ object Language { French, Italian, Spanish, + Hungarian, Dutch, Portuguese, Czech, diff --git a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala index 1abf0cce..56ab9a75 100644 --- a/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala +++ b/modules/fts-solr/src/main/scala/docspell/ftssolr/SolrSetup.scala @@ -127,7 +127,13 @@ object SolrSetup { "Add hebrew content field", addContentField(Language.Hebrew) ), - SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field") + SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"), + SolrMigration[F]( + 19, + "Add hungarian", + addContentField(Language.Hungarian) + ), + SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field") ) def addFolderField: F[Unit] = diff --git a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala index 813abf88..a2ea3c16 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ReProcessItem.scala @@ -18,11 +18,11 @@ import 
docspell.joex.Config import docspell.joex.analysis.RegexNerFile import docspell.joex.scheduler.Context import docspell.joex.scheduler.Task +import docspell.store.queries.QItem import docspell.store.records.RAttachment import docspell.store.records.RAttachmentSource import docspell.store.records.RCollective import docspell.store.records.RItem -import docspell.store.queries.QItem object ReProcessItem { type Args = ReProcessItemArgs diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql new file mode 100644 index 00000000..92acd861 --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.29.0__reset_classifier_file.sql @@ -0,0 +1,21 @@ +CREATE TEMPORARY TABLE "temp_file_ids" ( + cid varchar(254) not null, + file_id varchar(254) not null +); + +INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model"; + +INSERT INTO "job" + SELECT md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}', + 'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0 + FROM "classifier_setting"; + +DELETE FROM "classifier_model"; + +DELETE FROM "filemeta" +WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids"); + +DELETE FROM "filechunk" +WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids"); + +DROP TABLE "temp_file_ids"; diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 74479cd7..a42dd803 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -31,6 +31,7 @@ type Language | Latvian | Japanese | Hebrew + | Hungarian fromString : String -> Maybe Language @@ -86,6 +87,9 @@ fromString str = else if str == "heb" || str == "he" || str == "hebrew" then Just Hebrew + else if str == "hun" || str == "hu" || str == "hungarian" then + Just Hungarian + else Nothing @@ -144,6 +148,9 @@ toIso3 lang = Hebrew -> "heb" + Hungarian -> + "hun" + all : List Language all = @@ -164,4 +171,5 @@ all = , Latvian , Japanese , Hebrew + , Hungarian ] diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 93bcfe9c..5da90b73 100644 --- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -67,6 +67,9 @@ gb lang = Hebrew -> "Hebrew" + Hungarian -> + "Hungarian" + de : Language -> String de lang = @@ -121,3 +124,6 @@ de lang = Hebrew -> "Hebräisch" + + Hungarian -> + "Ungarisch" diff --git a/nix/module-joex.nix b/nix/module-joex.nix index aefd6c4a..003ff7b7 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -914,7 +914,7 @@ in { The full and basic variants rely on pre-build language models that are available for only 3 lanugages at the moment: German, - English and French. + English, French and Spanish. Memory usage varies greatly among the languages. German has quite large models, that require about 1G heap. 
So joex should
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 5595f758..a487d08a 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -40,7 +40,7 @@ object Dependencies {
  val ScalaJavaTimeVersion = "2.3.0"
  val ScodecBitsVersion = "1.1.29"
  val Slf4jVersion = "1.7.32"
-  val StanfordNlpVersion = "4.2.2"
+  val StanfordNlpVersion = "4.3.2"
  val TikaVersion = "2.1.0"
  val YamuscaVersion = "0.8.1"
  val SwaggerUIVersion = "4.1.0"
@@ -185,18 +185,16 @@ object Dependencies {
    )
  )

-  val stanfordNlpModels = Seq(
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models-german"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier("models-french"),
-    ("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
-      .classifier(
-        "models-english"
-      )
-  )
+  val stanfordNlpModels = {
+    val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
+    Seq(
+      artifact.classifier("models"),
+      artifact.classifier("models-german"),
+      artifact.classifier("models-french"),
+      artifact.classifier("models-english"),
+      artifact.classifier("models-spanish")
+    )
+  }

  val tika = Seq(
    "org.apache.tika" % "tika-core" % TikaVersion
diff --git a/project/NerModelsPlugin.scala b/project/NerModelsPlugin.scala
index a2d60869..f2cae44a 100644
--- a/project/NerModelsPlugin.scala
+++ b/project/NerModelsPlugin.scala
@@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
  }

  private val nerModels = List(
-    "german.distsim.crf.ser.gz",
+    // English
    "english.conll.4class.distsim.crf.ser.gz",
+    "regexner_caseless.tab",
+    "regexner_cased.tab",
+    "english-left3words-distsim.tagger",
+    "english-left3words-distsim.tagger.props",
+    // German
+    "german.distsim.crf.ser.gz",
+    "german-mwt.tsv",
+    "german-ud.tagger",
+    "german-ud.tagger.props",
+    // French
    "french-wikiner-4class.crf.ser.gz",
    "french-mwt-statistical.tsv",
    "french-mwt.tagger",
    "french-mwt.tsv",
-    "german-mwt.tsv",
-    "german-ud.tagger",
-    "german-ud.tagger.props",
    "french-ud.tagger",
    "french-ud.tagger.props",
-    "english-left3words-distsim.tagger",
-    "english-left3words-distsim.tagger.props"
+    // Spanish
+    "spanish.ancora.distsim.s512.crf.ser.gz",
+    "spanish-mwt.tsv",
+    "spanish-ud.tagger",
+    "kbp_regexner_number_sp.tag",
+    "kbp_regexner_mapping_sp.tag"
  )
}
diff --git a/website/site/content/docs/configure/_index.md b/website/site/content/docs/configure/_index.md
index 5eef1bb5..83d27088 100644
--- a/website/site/content/docs/configure/_index.md
+++ b/website/site/content/docs/configure/_index.md
@@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`, which
requires more memory for certain languages (with the advantage of
better results). Other values are `basic`, `regexonly` and
`disabled`. The modes `full` and `basic` use pre-defined lanugage
-models for procesing documents of languaes German, English and French.
-These require some amount of memory (see below).
+models for processing documents of languages German, English, French and
+Spanish. These require some amount of memory (see below).

The mode `basic` is like the "light" variant to `full`. It doesn't use
all NLP features, which makes memory consumption much lower, but comes
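The `full`/`basic`/`regexonly`/`disabled` behaviour described in this documentation hunk can be summarized as a small decision table. The sketch below is illustrative only; `NlpMode`, `ChooseAnalyser` and `describe` are made-up names, not docspell API. It simply restates the documented rules: pre-built models exist for German, English, French and, with this patch, Spanish, while other languages fall back to regex rules built from the address book.

```scala
import docspell.common.Language

// Hypothetical sketch, not docspell code: restates the documented behaviour
// of the nlp mode setting for quick reference.
sealed trait NlpMode
object NlpMode {
  case object Full      extends NlpMode
  case object Basic     extends NlpMode
  case object RegexOnly extends NlpMode
  case object Disabled  extends NlpMode
}

object ChooseAnalyser {
  private val withModels: Set[Language] =
    Set(Language.German, Language.English, Language.French, Language.Spanish)

  def describe(mode: NlpMode, lang: Language): String =
    mode match {
      case NlpMode.Disabled                           => "no text analysis"
      case NlpMode.RegexOnly                          => "regex rules from the address book only"
      case NlpMode.Full if withModels.contains(lang)  => "full CoreNLP pipeline (highest memory use)"
      case NlpMode.Basic if withModels.contains(lang) => "NER-only CRF classifier (lower memory use)"
      case _                                          => "no pre-built model: classification plus regex rules"
    }
}
```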
It doesn't use all NLP features, which makes memory consumption much lower, but comes diff --git a/website/site/content/docs/joex/file-processing.md b/website/site/content/docs/joex/file-processing.md index 5ab0e0b1..360412db 100644 --- a/website/site/content/docs/joex/file-processing.md +++ b/website/site/content/docs/joex/file-processing.md @@ -8,10 +8,10 @@ mktoc = true +++ When uploading a file, it is only saved to the database together with -the given meta information. The file is not visible in the ui yet. -Then joex takes the next such file (or files in case you uploaded -many) and starts processing it. When processing finished, the item and -its files will show up in the ui. +the given meta information as a "job". The file is not visible in the +ui yet. Then joex takes the next such job and starts processing it. +When processing finished, the item and its files will show up in the +ui. If an error occurs during processing, the item will be created anyways, so you can see it. Depending on the error, some information @@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a whole language. These are also provided by [Stanford NLP](https://nlp.stanford.edu/software/), but not for all languages. So whether this can be used depends on the document language. Models -exist for German, English and French currently. +exist for German, English, French and Spanish currently. Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to run custom rules against a text. This can be used as a fallback for diff --git a/website/site/content/docs/webapp/metadata.md b/website/site/content/docs/webapp/metadata.md index fb096641..3f97e29e 100644 --- a/website/site/content/docs/webapp/metadata.md +++ b/website/site/content/docs/webapp/metadata.md @@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the language. Docspell uses the [Stanford NLP Library](https://nlp.stanford.edu/software/) for its great machine learning algorithms. Some of them, like certain NLP features, are only -available for some languages – namely German, English and French. The -reason is that the required statistical models are not available for -other languages. However, docspell can still run other algorithms for -the other languages, like classification and custom rules based on the -address book. +available for some languages – namely German, English, French and +Spanish. The reason is that the required statistical models are not +available for other languages. However, docspell can still run other +algorithms for the other languages, like classification and custom +rules based on the address book. More information about file processing and text analysis can be found [here](@/docs/joex/file-processing.md#text-analysis).