mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-04-04 10:29:34 +00:00
Merge pull request #1190 from eikek/update-stanford-core-nlp
Update stanford core nlp
This commit is contained in:
commit
aecc689240
@ -45,15 +45,16 @@ object DateFind {
|
||||
private[this] val jpnChars =
|
||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||
|
||||
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||
val stext =
|
||||
if (lang == Language.Japanese) {
|
||||
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||
} else text
|
||||
|
||||
TextSplitter
|
||||
.splitToken(stext, " \t.,\n\r/年月日".toSet)
|
||||
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
|
||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||
}
|
||||
|
||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||
@ -91,6 +92,7 @@ object DateFind {
|
||||
case Language.French => dmy.or(ymd).or(mdy)
|
||||
case Language.Italian => dmy.or(ymd).or(mdy)
|
||||
case Language.Spanish => dmy.or(ymd).or(mdy)
|
||||
case Language.Hungarian => ymd
|
||||
case Language.Czech => dmy.or(ymd).or(mdy)
|
||||
case Language.Danish => dmy.or(ymd).or(mdy)
|
||||
case Language.Finnish => dmy.or(ymd).or(mdy)
|
||||
|
@ -30,6 +30,8 @@ object MonthName {
|
||||
italian
|
||||
case Language.Spanish =>
|
||||
spanish
|
||||
case Language.Hungarian =>
|
||||
hungarian
|
||||
case Language.Swedish =>
|
||||
swedish
|
||||
case Language.Norwegian =>
|
||||
@ -324,4 +326,19 @@ object MonthName {
|
||||
List("11", "נובמבר"),
|
||||
List("12", "דצמבר")
|
||||
)
|
||||
|
||||
/** Hungarian month names, January through December. Each entry lists the
  * Roman numeral, the abbreviation and the full name of one month.
  */
private val hungarian = List(
  List("I", "jan", "január"),
  List("II", "febr", "február"),
  List("III", "márc", "március"),
  List("IV", "ápr", "április"),
  List("V", "máj", "május"),
  List("VI", "jún", "június"),
  List("VII", "júl", "július"),
  List("VIII", "aug", "augusztus"),
  List("IX", "szept", "szeptember"),
  List("X", "okt", "október"),
  List("XI", "nov", "november"),
  List("XII", "dec", "december")
)
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ object BasicCRFAnnotator {
|
||||
private[this] val logger = getLogger
|
||||
|
||||
// assert correct resource names
|
||||
List(Language.French, Language.German, Language.English).foreach(classifierResource)
|
||||
NLPLanguage.all.toList.foreach(classifierResource)
|
||||
|
||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||
|
||||
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
|
||||
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
||||
case Language.English =>
|
||||
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
case Language.Spanish =>
|
||||
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
|
||||
// case Language.Italian =>
|
||||
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
|
||||
// case Language.Hungarian =>
|
||||
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
|
||||
})
|
||||
}
|
||||
|
||||
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
|
||||
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
|
||||
|
||||
/** Selects the NER classifier for the given language.
  *
  * The classifiers are `lazy val`s, so each one is only created through
  * `makeAnnotator` the first time its language is requested.
  */
def forLang(language: NLPLanguage): Annotator =
  language match {
    case Language.French  => frenchNerClassifier
    case Language.German  => germanNerClassifier
    case Language.English => englishNerClassifier
    case Language.Spanish => spanishNerClassifier
  }
|
||||
}
|
||||
|
||||
|
@ -37,6 +37,8 @@ object Properties {
|
||||
Properties.nerEnglish(regexNerFile)
|
||||
case Language.French =>
|
||||
Properties.nerFrench(regexNerFile, highRecall)
|
||||
case Language.Spanish =>
|
||||
Properties.nerSpanish(regexNerFile, highRecall)
|
||||
}
|
||||
case StanfordNerSettings.RegexOnly(path) =>
|
||||
Properties.regexNerOnly(path)
|
||||
@ -88,6 +90,18 @@ object Properties {
|
||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
/** Builds the Stanford NLP pipeline properties for Spanish texts.
  *
  * @param regexNerMappingFile
  *   optional mapping file, handed to `withRegexNer`
  * @param highRecall
  *   flag handed to `withHighRecall`
  * @return
  *   the Spanish property set with both options applied
  */
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
  Properties(
    "annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner",
    "tokenize.language" -> "es",
    "mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
    "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
    "ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
    "ner.applyNumericClassifiers" -> "true",
    "ner.useSUTime" -> "false",
    "ner.language" -> "es"
  ).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||
|
||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||
Properties(
|
||||
"annotators" -> "tokenize,ssplit"
|
||||
|
Binary file not shown.
@ -13,7 +13,7 @@ import docspell.files.TestFiles
|
||||
|
||||
import munit._
|
||||
|
||||
class DateFindSpec extends FunSuite {
|
||||
class DateFindTest extends FunSuite {
|
||||
|
||||
test("find simple dates") {
|
||||
val expect = Vector(
|
||||
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
|
||||
)
|
||||
}
|
||||
|
||||
test("find spanish dates") {
  // A written-out date; the expected label covers characters 27 to 50 of the input.
  assertEquals(
    DateFind
      .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2011, 12, 15),
        NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
      )
    )
  )

  // A purely numeric date whose parts are separated by hyphens.
  assertEquals(
    DateFind
      .findDates("2021-11-19", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2021, 11, 19),
        NerLabel("2021-11-19", NerTag.Date, 0, 10)
      )
    )
  )
}
|
||||
}
|
@ -30,7 +30,7 @@ object Language {
|
||||
override val allowsNLP = true
|
||||
}
|
||||
object NLPLanguage {
|
||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
|
||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
|
||||
}
|
||||
|
||||
case object German extends NLPLanguage {
|
||||
@ -53,11 +53,16 @@ object Language {
|
||||
val iso3 = "ita"
|
||||
}
|
||||
|
||||
case object Spanish extends Language {
|
||||
case object Spanish extends NLPLanguage {
|
||||
val iso2 = "es"
|
||||
val iso3 = "spa"
|
||||
}
|
||||
|
||||
case object Hungarian extends Language {
|
||||
val iso2 = "hu"
|
||||
val iso3 = "hun"
|
||||
}
|
||||
|
||||
case object Portuguese extends Language {
|
||||
val iso2 = "pt"
|
||||
val iso3 = "por"
|
||||
@ -125,6 +130,7 @@ object Language {
|
||||
French,
|
||||
Italian,
|
||||
Spanish,
|
||||
Hungarian,
|
||||
Dutch,
|
||||
Portuguese,
|
||||
Czech,
|
||||
|
@ -127,7 +127,13 @@ object SolrSetup {
|
||||
"Add hebrew content field",
|
||||
addContentField(Language.Hebrew)
|
||||
),
|
||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
|
||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
|
||||
SolrMigration[F](
|
||||
19,
|
||||
"Add hungarian",
|
||||
addContentField(Language.Hungarian)
|
||||
),
|
||||
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
|
||||
)
|
||||
|
||||
def addFolderField: F[Unit] =
|
||||
|
@ -18,6 +18,7 @@ import docspell.joex.Config
|
||||
import docspell.joex.analysis.RegexNerFile
|
||||
import docspell.joex.scheduler.Context
|
||||
import docspell.joex.scheduler.Task
|
||||
import docspell.store.queries.QItem
|
||||
import docspell.store.records.RAttachment
|
||||
import docspell.store.records.RAttachmentSource
|
||||
import docspell.store.records.RCollective
|
||||
@ -131,10 +132,13 @@ object ReProcessItem {
|
||||
|
||||
def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
|
||||
Task { ctx =>
|
||||
(for {
|
||||
coll <- OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
|
||||
lang = coll.language
|
||||
} yield lang).getOrElse(Language.German)
|
||||
val lang1 = OptionT(
|
||||
ctx.store.transact(QItem.getItemLanguage(ctx.args.itemId)).map(_.headOption)
|
||||
)
|
||||
val lang2 = OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
|
||||
.map(_.language)
|
||||
|
||||
lang1.orElse(lang2).getOrElse(Language.German)
|
||||
}
|
||||
|
||||
def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =
|
||||
|
@ -0,0 +1,21 @@
|
||||
CREATE TEMPORARY TABLE "temp_file_ids" (
|
||||
cid varchar(254) not null,
|
||||
file_id varchar(254) not null
|
||||
);
|
||||
|
||||
INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model";
|
||||
|
||||
INSERT INTO "job"
|
||||
SELECT md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}',
|
||||
'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
|
||||
FROM "classifier_setting";
|
||||
|
||||
DELETE FROM "classifier_model";
|
||||
|
||||
DELETE FROM "filemeta"
|
||||
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
|
||||
|
||||
DELETE FROM "filechunk"
|
||||
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
|
||||
|
||||
DROP TABLE "temp_file_ids";
|
@ -714,4 +714,13 @@ object QItem {
|
||||
txt = texts.map(_._1).mkString(pageSep)
|
||||
} yield TextAndTag(itemId, txt, tag)
|
||||
|
||||
/** Gets the languages of the attachments of the given item, ordered by
  * attachment position in ascending order. The head of the list, if any,
  * is therefore the language of the first attachment; the list is empty
  * when the query matches no rows.
  */
def getItemLanguage(itemId: Ident): ConnectionIO[List[Language]] =
  Select(
    select(m.language),
    from(m)
      .innerJoin(a, a.id === m.id)
      .innerJoin(i, i.id === a.itemId),
    i.id === itemId
  ).orderBy(a.position.asc).build.query[Language].to[List]
|
||||
}
|
||||
|
@ -31,6 +31,7 @@ type Language
|
||||
| Latvian
|
||||
| Japanese
|
||||
| Hebrew
|
||||
| Hungarian
|
||||
|
||||
|
||||
fromString : String -> Maybe Language
|
||||
@ -86,6 +87,9 @@ fromString str =
|
||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||
Just Hebrew
|
||||
|
||||
else if str == "hun" || str == "hu" || str == "hungarian" then
|
||||
Just Hungarian
|
||||
|
||||
else
|
||||
Nothing
|
||||
|
||||
@ -144,6 +148,9 @@ toIso3 lang =
|
||||
Hebrew ->
|
||||
"heb"
|
||||
|
||||
Hungarian ->
|
||||
"hun"
|
||||
|
||||
|
||||
all : List Language
|
||||
all =
|
||||
@ -164,4 +171,5 @@ all =
|
||||
, Latvian
|
||||
, Japanese
|
||||
, Hebrew
|
||||
, Hungarian
|
||||
]
|
||||
|
@ -67,6 +67,9 @@ gb lang =
|
||||
Hebrew ->
|
||||
"Hebrew"
|
||||
|
||||
Hungarian ->
|
||||
"Hungarian"
|
||||
|
||||
|
||||
de : Language -> String
|
||||
de lang =
|
||||
@ -121,3 +124,6 @@ de lang =
|
||||
|
||||
Hebrew ->
|
||||
"Hebräisch"
|
||||
|
||||
Hungarian ->
|
||||
"Ungarisch"
|
||||
|
@ -914,7 +914,7 @@ in {
|
||||
|
||||
The full and basic variants rely on pre-built language models
|
||||
that are available for only a few languages at the moment: German,
|
||||
English and French.
|
||||
English, French and Spanish.
|
||||
|
||||
Memory usage varies greatly among the languages. German has
|
||||
quite large models, that require about 1G heap. So joex should
|
||||
|
@ -40,7 +40,7 @@ object Dependencies {
|
||||
val ScalaJavaTimeVersion = "2.3.0"
|
||||
val ScodecBitsVersion = "1.1.29"
|
||||
val Slf4jVersion = "1.7.32"
|
||||
val StanfordNlpVersion = "4.2.2"
|
||||
val StanfordNlpVersion = "4.3.2"
|
||||
val TikaVersion = "2.1.0"
|
||||
val YamuscaVersion = "0.8.1"
|
||||
val SwaggerUIVersion = "4.1.0"
|
||||
@ -185,18 +185,16 @@ object Dependencies {
|
||||
)
|
||||
)
|
||||
|
||||
val stanfordNlpModels = Seq(
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models-german"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier("models-french"),
|
||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
||||
.classifier(
|
||||
"models-english"
|
||||
)
|
||||
)
|
||||
val stanfordNlpModels = {
|
||||
val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
|
||||
Seq(
|
||||
artifact.classifier("models"),
|
||||
artifact.classifier("models-german"),
|
||||
artifact.classifier("models-french"),
|
||||
artifact.classifier("models-english"),
|
||||
artifact.classifier("models-spanish")
|
||||
)
|
||||
}
|
||||
|
||||
val tika = Seq(
|
||||
"org.apache.tika" % "tika-core" % TikaVersion
|
||||
|
@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
|
||||
}
|
||||
|
||||
private val nerModels = List(
|
||||
"german.distsim.crf.ser.gz",
|
||||
// English
|
||||
"english.conll.4class.distsim.crf.ser.gz",
|
||||
"regexner_caseless.tab",
|
||||
"regexner_cased.tab",
|
||||
"english-left3words-distsim.tagger",
|
||||
"english-left3words-distsim.tagger.props",
|
||||
// German
|
||||
"german.distsim.crf.ser.gz",
|
||||
"german-mwt.tsv",
|
||||
"german-ud.tagger",
|
||||
"german-ud.tagger.props",
|
||||
// French
|
||||
"french-wikiner-4class.crf.ser.gz",
|
||||
"french-mwt-statistical.tsv",
|
||||
"french-mwt.tagger",
|
||||
"french-mwt.tsv",
|
||||
"german-mwt.tsv",
|
||||
"german-ud.tagger",
|
||||
"german-ud.tagger.props",
|
||||
"french-ud.tagger",
|
||||
"french-ud.tagger.props",
|
||||
"english-left3words-distsim.tagger",
|
||||
"english-left3words-distsim.tagger.props"
|
||||
// Spanish
|
||||
"spanish.ancora.distsim.s512.crf.ser.gz",
|
||||
"spanish-mwt.tsv",
|
||||
"spanish-ud.tagger",
|
||||
"kbp_regexner_number_sp.tag",
|
||||
"kbp_regexner_mapping_sp.tag"
|
||||
)
|
||||
}
|
||||
|
@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
|
||||
which requires more memory for certain languages (with the advantage
|
||||
of better results). Other values are `basic`, `regexonly` and
|
||||
`disabled`. The modes `full` and `basic` use pre-defined language
|
||||
models for processing documents of languages German, English and French.
|
||||
These require some amount of memory (see below).
|
||||
models for processing documents of languages German, English, French and
|
||||
Spanish. These require some amount of memory (see below).
|
||||
|
||||
The mode `basic` is like the "light" variant to `full`. It doesn't use
|
||||
all NLP features, which makes memory consumption much lower, but comes
|
||||
|
@ -8,10 +8,10 @@ mktoc = true
|
||||
+++
|
||||
|
||||
When uploading a file, it is only saved to the database together with
|
||||
the given meta information. The file is not visible in the ui yet.
|
||||
Then joex takes the next such file (or files in case you uploaded
|
||||
many) and starts processing it. When processing has finished, the item and
|
||||
its files will show up in the ui.
|
||||
the given meta information as a "job". The file is not visible in the
|
||||
ui yet. Then joex takes the next such job and starts processing it.
|
||||
When processing has finished, the item and its files will show up in the
|
||||
ui.
|
||||
|
||||
If an error occurs during processing, the item will be created
|
||||
anyways, so you can see it. Depending on the error, some information
|
||||
@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
|
||||
whole language. These are also provided by [Stanford
|
||||
NLP](https://nlp.stanford.edu/software/), but not for all languages.
|
||||
So whether this can be used depends on the document language. Models
|
||||
exist for German, English and French currently.
|
||||
exist for German, English, French and Spanish currently.
|
||||
|
||||
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows you to
|
||||
run custom rules against a text. This can be used as a fallback for
|
||||
|
@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the
|
||||
language. Docspell uses the [Stanford NLP
|
||||
Library](https://nlp.stanford.edu/software/) for its great machine
|
||||
learning algorithms. Some of them, like certain NLP features, are only
|
||||
available for some languages – namely German, English and French. The
|
||||
reason is that the required statistical models are not available for
|
||||
other languages. However, docspell can still run other algorithms for
|
||||
the other languages, like classification and custom rules based on the
|
||||
address book.
|
||||
available for some languages – namely German, English, French and
|
||||
Spanish. The reason is that the required statistical models are not
|
||||
available for other languages. However, docspell can still run other
|
||||
algorithms for the other languages, like classification and custom
|
||||
rules based on the address book.
|
||||
|
||||
More information about file processing and text analysis can be found
|
||||
[here](@/docs/joex/file-processing.md#text-analysis).
|
||||
|
Loading…
x
Reference in New Issue
Block a user