mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-06 15:15:58 +00:00
Merge pull request #1190 from eikek/update-stanford-core-nlp
Update stanford core nlp
This commit is contained in:
commit
aecc689240
@ -45,15 +45,16 @@ object DateFind {
|
|||||||
private[this] val jpnChars =
|
private[this] val jpnChars =
|
||||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||||
|
|
||||||
private def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
|
||||||
val stext =
|
val stext =
|
||||||
if (lang == Language.Japanese) {
|
if (lang == Language.Japanese) {
|
||||||
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
text.map(c => if (jpnChars.contains(c)) c else ' ')
|
||||||
} else text
|
} else text
|
||||||
|
|
||||||
TextSplitter
|
TextSplitter
|
||||||
.splitToken(stext, " \t.,\n\r/年月日".toSet)
|
.splitToken(stext, " -\t.,\n\r/年月日".toSet)
|
||||||
.filter(w => lang != Language.Latvian || w.value != "gada")
|
.filter(w => lang != Language.Latvian || w.value != "gada")
|
||||||
|
.filter(w => lang != Language.Spanish || w.value != "de")
|
||||||
}
|
}
|
||||||
|
|
||||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||||
@ -91,6 +92,7 @@ object DateFind {
|
|||||||
case Language.French => dmy.or(ymd).or(mdy)
|
case Language.French => dmy.or(ymd).or(mdy)
|
||||||
case Language.Italian => dmy.or(ymd).or(mdy)
|
case Language.Italian => dmy.or(ymd).or(mdy)
|
||||||
case Language.Spanish => dmy.or(ymd).or(mdy)
|
case Language.Spanish => dmy.or(ymd).or(mdy)
|
||||||
|
case Language.Hungarian => ymd
|
||||||
case Language.Czech => dmy.or(ymd).or(mdy)
|
case Language.Czech => dmy.or(ymd).or(mdy)
|
||||||
case Language.Danish => dmy.or(ymd).or(mdy)
|
case Language.Danish => dmy.or(ymd).or(mdy)
|
||||||
case Language.Finnish => dmy.or(ymd).or(mdy)
|
case Language.Finnish => dmy.or(ymd).or(mdy)
|
||||||
|
@ -30,6 +30,8 @@ object MonthName {
|
|||||||
italian
|
italian
|
||||||
case Language.Spanish =>
|
case Language.Spanish =>
|
||||||
spanish
|
spanish
|
||||||
|
case Language.Hungarian =>
|
||||||
|
hungarian
|
||||||
case Language.Swedish =>
|
case Language.Swedish =>
|
||||||
swedish
|
swedish
|
||||||
case Language.Norwegian =>
|
case Language.Norwegian =>
|
||||||
@ -324,4 +326,19 @@ object MonthName {
|
|||||||
List("11", "נובמבר"),
|
List("11", "נובמבר"),
|
||||||
List("12", "דצמבר")
|
List("12", "דצמבר")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private val hungarian = List(
|
||||||
|
List("I", "jan", "január"),
|
||||||
|
List("II", "febr", "február"),
|
||||||
|
List("III", "márc", "március"),
|
||||||
|
List("IV", "ápr", "április"),
|
||||||
|
List("V", "máj", "május"),
|
||||||
|
List("VI", "jún", "június"),
|
||||||
|
List("VII", "júl", "július"),
|
||||||
|
List("VIII", "aug", "augusztus"),
|
||||||
|
List("IX", "szept", "szeptember"),
|
||||||
|
List("X", "okt", "október"),
|
||||||
|
List("XI", "nov", "november"),
|
||||||
|
List("XII", "dec", "december")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,7 @@ object BasicCRFAnnotator {
|
|||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
// assert correct resource names
|
// assert correct resource names
|
||||||
List(Language.French, Language.German, Language.English).foreach(classifierResource)
|
NLPLanguage.all.toList.foreach(classifierResource)
|
||||||
|
|
||||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||||
|
|
||||||
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
|
|||||||
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
||||||
case Language.English =>
|
case Language.English =>
|
||||||
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||||
|
case Language.Spanish =>
|
||||||
|
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
|
||||||
|
// case Language.Italian =>
|
||||||
|
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
|
||||||
|
// case Language.Hungarian =>
|
||||||
|
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
|
|||||||
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||||
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||||
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||||
|
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
|
||||||
|
|
||||||
def forLang(language: NLPLanguage): Annotator =
|
def forLang(language: NLPLanguage): Annotator =
|
||||||
language match {
|
language match {
|
||||||
case Language.French => frenchNerClassifier
|
case Language.French => frenchNerClassifier
|
||||||
case Language.German => germanNerClassifier
|
case Language.German => germanNerClassifier
|
||||||
case Language.English => englishNerClassifier
|
case Language.English => englishNerClassifier
|
||||||
|
case Language.Spanish => spanishNerClassifier
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,6 +37,8 @@ object Properties {
|
|||||||
Properties.nerEnglish(regexNerFile)
|
Properties.nerEnglish(regexNerFile)
|
||||||
case Language.French =>
|
case Language.French =>
|
||||||
Properties.nerFrench(regexNerFile, highRecall)
|
Properties.nerFrench(regexNerFile, highRecall)
|
||||||
|
case Language.Spanish =>
|
||||||
|
Properties.nerSpanish(regexNerFile, highRecall)
|
||||||
}
|
}
|
||||||
case StanfordNerSettings.RegexOnly(path) =>
|
case StanfordNerSettings.RegexOnly(path) =>
|
||||||
Properties.regexNerOnly(path)
|
Properties.regexNerOnly(path)
|
||||||
@ -88,6 +90,18 @@ object Properties {
|
|||||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||||
|
|
||||||
|
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps =
|
||||||
|
Properties(
|
||||||
|
"annotators" -> "tokenize, ssplit, mwt, pos, lemma, ner",
|
||||||
|
"tokenize.language" -> "es",
|
||||||
|
"mwt.mappingFile" -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
|
||||||
|
"pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
|
||||||
|
"ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
|
||||||
|
"ner.applyNumericClassifiers" -> "true",
|
||||||
|
"ner.useSUTime" -> "false",
|
||||||
|
"ner.language" -> "es"
|
||||||
|
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||||
|
|
||||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||||
Properties(
|
Properties(
|
||||||
"annotators" -> "tokenize,ssplit"
|
"annotators" -> "tokenize,ssplit"
|
||||||
|
Binary file not shown.
@ -13,7 +13,7 @@ import docspell.files.TestFiles
|
|||||||
|
|
||||||
import munit._
|
import munit._
|
||||||
|
|
||||||
class DateFindSpec extends FunSuite {
|
class DateFindTest extends FunSuite {
|
||||||
|
|
||||||
test("find simple dates") {
|
test("find simple dates") {
|
||||||
val expect = Vector(
|
val expect = Vector(
|
||||||
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find spanish dates") {
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2011, 12, 15),
|
||||||
|
NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
println(DateFind.splitWords("2021-11-19", Language.Spanish).toList)
|
||||||
|
assertEquals(
|
||||||
|
DateFind
|
||||||
|
.findDates("2021-11-19", Language.Spanish)
|
||||||
|
.toVector,
|
||||||
|
Vector(
|
||||||
|
NerDateLabel(
|
||||||
|
LocalDate.of(2021, 11, 19),
|
||||||
|
NerLabel("2021-11-19", NerTag.Date, 0, 10)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
@ -30,7 +30,7 @@ object Language {
|
|||||||
override val allowsNLP = true
|
override val allowsNLP = true
|
||||||
}
|
}
|
||||||
object NLPLanguage {
|
object NLPLanguage {
|
||||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
|
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
|
||||||
}
|
}
|
||||||
|
|
||||||
case object German extends NLPLanguage {
|
case object German extends NLPLanguage {
|
||||||
@ -53,11 +53,16 @@ object Language {
|
|||||||
val iso3 = "ita"
|
val iso3 = "ita"
|
||||||
}
|
}
|
||||||
|
|
||||||
case object Spanish extends Language {
|
case object Spanish extends NLPLanguage {
|
||||||
val iso2 = "es"
|
val iso2 = "es"
|
||||||
val iso3 = "spa"
|
val iso3 = "spa"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case object Hungarian extends Language {
|
||||||
|
val iso2 = "hu"
|
||||||
|
val iso3 = "hun"
|
||||||
|
}
|
||||||
|
|
||||||
case object Portuguese extends Language {
|
case object Portuguese extends Language {
|
||||||
val iso2 = "pt"
|
val iso2 = "pt"
|
||||||
val iso3 = "por"
|
val iso3 = "por"
|
||||||
@ -125,6 +130,7 @@ object Language {
|
|||||||
French,
|
French,
|
||||||
Italian,
|
Italian,
|
||||||
Spanish,
|
Spanish,
|
||||||
|
Hungarian,
|
||||||
Dutch,
|
Dutch,
|
||||||
Portuguese,
|
Portuguese,
|
||||||
Czech,
|
Czech,
|
||||||
|
@ -127,7 +127,13 @@ object SolrSetup {
|
|||||||
"Add hebrew content field",
|
"Add hebrew content field",
|
||||||
addContentField(Language.Hebrew)
|
addContentField(Language.Hebrew)
|
||||||
),
|
),
|
||||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
|
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
|
||||||
|
SolrMigration[F](
|
||||||
|
19,
|
||||||
|
"Add hungarian",
|
||||||
|
addContentField(Language.Hungarian)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
|
@ -18,6 +18,7 @@ import docspell.joex.Config
|
|||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
|
import docspell.store.queries.QItem
|
||||||
import docspell.store.records.RAttachment
|
import docspell.store.records.RAttachment
|
||||||
import docspell.store.records.RAttachmentSource
|
import docspell.store.records.RAttachmentSource
|
||||||
import docspell.store.records.RCollective
|
import docspell.store.records.RCollective
|
||||||
@ -131,10 +132,13 @@ object ReProcessItem {
|
|||||||
|
|
||||||
def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
|
def getLanguage[F[_]: Sync]: Task[F, Args, Language] =
|
||||||
Task { ctx =>
|
Task { ctx =>
|
||||||
(for {
|
val lang1 = OptionT(
|
||||||
coll <- OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
|
ctx.store.transact(QItem.getItemLanguage(ctx.args.itemId)).map(_.headOption)
|
||||||
lang = coll.language
|
)
|
||||||
} yield lang).getOrElse(Language.German)
|
val lang2 = OptionT(ctx.store.transact(RCollective.findByItem(ctx.args.itemId)))
|
||||||
|
.map(_.language)
|
||||||
|
|
||||||
|
lang1.orElse(lang2).getOrElse(Language.German)
|
||||||
}
|
}
|
||||||
|
|
||||||
def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =
|
def isLastRetry[F[_]: Sync]: Task[F, Args, Boolean] =
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
CREATE TEMPORARY TABLE "temp_file_ids" (
|
||||||
|
cid varchar(254) not null,
|
||||||
|
file_id varchar(254) not null
|
||||||
|
);
|
||||||
|
|
||||||
|
INSERT INTO "temp_file_ids" SELECT "cid", "file_id" FROM "classifier_model";
|
||||||
|
|
||||||
|
INSERT INTO "job"
|
||||||
|
SELECT md5(random()::text), 'learn-classifier', cid, '{"collective":"' || cid || '"}',
|
||||||
|
'new classifier', now(), 'docspell-system', 0, 'waiting', 0, 0
|
||||||
|
FROM "classifier_setting";
|
||||||
|
|
||||||
|
DELETE FROM "classifier_model";
|
||||||
|
|
||||||
|
DELETE FROM "filemeta"
|
||||||
|
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
|
||||||
|
|
||||||
|
DELETE FROM "filechunk"
|
||||||
|
WHERE "file_id" in (SELECT "file_id" FROM "temp_file_ids");
|
||||||
|
|
||||||
|
DROP TABLE "temp_file_ids";
|
@ -714,4 +714,13 @@ object QItem {
|
|||||||
txt = texts.map(_._1).mkString(pageSep)
|
txt = texts.map(_._1).mkString(pageSep)
|
||||||
} yield TextAndTag(itemId, txt, tag)
|
} yield TextAndTag(itemId, txt, tag)
|
||||||
|
|
||||||
|
/** Gets the language of the first attachment of the given item. */
|
||||||
|
def getItemLanguage(itemId: Ident): ConnectionIO[List[Language]] =
|
||||||
|
Select(
|
||||||
|
select(m.language),
|
||||||
|
from(m)
|
||||||
|
.innerJoin(a, a.id === m.id)
|
||||||
|
.innerJoin(i, i.id === a.itemId),
|
||||||
|
i.id === itemId
|
||||||
|
).orderBy(a.position.asc).build.query[Language].to[List]
|
||||||
}
|
}
|
||||||
|
@ -31,6 +31,7 @@ type Language
|
|||||||
| Latvian
|
| Latvian
|
||||||
| Japanese
|
| Japanese
|
||||||
| Hebrew
|
| Hebrew
|
||||||
|
| Hungarian
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -86,6 +87,9 @@ fromString str =
|
|||||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||||
Just Hebrew
|
Just Hebrew
|
||||||
|
|
||||||
|
else if str == "hun" || str == "hu" || str == "hungarian" then
|
||||||
|
Just Hungarian
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -144,6 +148,9 @@ toIso3 lang =
|
|||||||
Hebrew ->
|
Hebrew ->
|
||||||
"heb"
|
"heb"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"hun"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -164,4 +171,5 @@ all =
|
|||||||
, Latvian
|
, Latvian
|
||||||
, Japanese
|
, Japanese
|
||||||
, Hebrew
|
, Hebrew
|
||||||
|
, Hungarian
|
||||||
]
|
]
|
||||||
|
@ -67,6 +67,9 @@ gb lang =
|
|||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebrew"
|
"Hebrew"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"Hungarian"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -121,3 +124,6 @@ de lang =
|
|||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebräisch"
|
"Hebräisch"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"Ungarisch"
|
||||||
|
@ -914,7 +914,7 @@ in {
|
|||||||
|
|
||||||
The full and basic variants rely on pre-build language models
|
The full and basic variants rely on pre-build language models
|
||||||
that are available for only 3 languages at the moment: German,
|
that are available for only 4 languages at the moment: German,
|
||||||
English and French.
|
English, French and Spanish.
|
||||||
|
|
||||||
Memory usage varies greatly among the languages. German has
|
Memory usage varies greatly among the languages. German has
|
||||||
quite large models, that require about 1G heap. So joex should
|
quite large models, that require about 1G heap. So joex should
|
||||||
|
@ -40,7 +40,7 @@ object Dependencies {
|
|||||||
val ScalaJavaTimeVersion = "2.3.0"
|
val ScalaJavaTimeVersion = "2.3.0"
|
||||||
val ScodecBitsVersion = "1.1.29"
|
val ScodecBitsVersion = "1.1.29"
|
||||||
val Slf4jVersion = "1.7.32"
|
val Slf4jVersion = "1.7.32"
|
||||||
val StanfordNlpVersion = "4.2.2"
|
val StanfordNlpVersion = "4.3.2"
|
||||||
val TikaVersion = "2.1.0"
|
val TikaVersion = "2.1.0"
|
||||||
val YamuscaVersion = "0.8.1"
|
val YamuscaVersion = "0.8.1"
|
||||||
val SwaggerUIVersion = "4.1.0"
|
val SwaggerUIVersion = "4.1.0"
|
||||||
@ -185,18 +185,16 @@ object Dependencies {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
val stanfordNlpModels = Seq(
|
val stanfordNlpModels = {
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
|
||||||
.classifier("models"),
|
Seq(
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models"),
|
||||||
.classifier("models-german"),
|
artifact.classifier("models-german"),
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models-french"),
|
||||||
.classifier("models-french"),
|
artifact.classifier("models-english"),
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models-spanish")
|
||||||
.classifier(
|
)
|
||||||
"models-english"
|
}
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
val tika = Seq(
|
val tika = Seq(
|
||||||
"org.apache.tika" % "tika-core" % TikaVersion
|
"org.apache.tika" % "tika-core" % TikaVersion
|
||||||
|
@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private val nerModels = List(
|
private val nerModels = List(
|
||||||
"german.distsim.crf.ser.gz",
|
// English
|
||||||
"english.conll.4class.distsim.crf.ser.gz",
|
"english.conll.4class.distsim.crf.ser.gz",
|
||||||
|
"regexner_caseless.tab",
|
||||||
|
"regexner_cased.tab",
|
||||||
|
"english-left3words-distsim.tagger",
|
||||||
|
"english-left3words-distsim.tagger.props",
|
||||||
|
// German
|
||||||
|
"german.distsim.crf.ser.gz",
|
||||||
|
"german-mwt.tsv",
|
||||||
|
"german-ud.tagger",
|
||||||
|
"german-ud.tagger.props",
|
||||||
|
// French
|
||||||
"french-wikiner-4class.crf.ser.gz",
|
"french-wikiner-4class.crf.ser.gz",
|
||||||
"french-mwt-statistical.tsv",
|
"french-mwt-statistical.tsv",
|
||||||
"french-mwt.tagger",
|
"french-mwt.tagger",
|
||||||
"french-mwt.tsv",
|
"french-mwt.tsv",
|
||||||
"german-mwt.tsv",
|
|
||||||
"german-ud.tagger",
|
|
||||||
"german-ud.tagger.props",
|
|
||||||
"french-ud.tagger",
|
"french-ud.tagger",
|
||||||
"french-ud.tagger.props",
|
"french-ud.tagger.props",
|
||||||
"english-left3words-distsim.tagger",
|
// Spanish
|
||||||
"english-left3words-distsim.tagger.props"
|
"spanish.ancora.distsim.s512.crf.ser.gz",
|
||||||
|
"spanish-mwt.tsv",
|
||||||
|
"spanish-ud.tagger",
|
||||||
|
"kbp_regexner_number_sp.tag",
|
||||||
|
"kbp_regexner_mapping_sp.tag"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
|
|||||||
which requires more memory for certain languages (with the advantage
|
which requires more memory for certain languages (with the advantage
|
||||||
of better results). Other values are `basic`, `regexonly` and
|
of better results). Other values are `basic`, `regexonly` and
|
||||||
`disabled`. The modes `full` and `basic` use pre-defined language
|
`disabled`. The modes `full` and `basic` use pre-defined language
|
||||||
models for processing documents of languages German, English and French.
|
models for processing documents of languages German, English, French and
|
||||||
These require some amount of memory (see below).
|
Spanish. These require some amount of memory (see below).
|
||||||
|
|
||||||
The mode `basic` is like the "light" variant to `full`. It doesn't use
|
The mode `basic` is like the "light" variant to `full`. It doesn't use
|
||||||
all NLP features, which makes memory consumption much lower, but comes
|
all NLP features, which makes memory consumption much lower, but comes
|
||||||
|
@ -8,10 +8,10 @@ mktoc = true
|
|||||||
+++
|
+++
|
||||||
|
|
||||||
When uploading a file, it is only saved to the database together with
|
When uploading a file, it is only saved to the database together with
|
||||||
the given meta information. The file is not visible in the ui yet.
|
the given meta information as a "job". The file is not visible in the
|
||||||
Then joex takes the next such file (or files in case you uploaded
|
ui yet. Then joex takes the next such job and starts processing it.
|
||||||
many) and starts processing it. When processing finished, the item and
|
When processing finished, the item and its files will show up in the
|
||||||
its files will show up in the ui.
|
ui.
|
||||||
|
|
||||||
If an error occurs during processing, the item will be created
|
If an error occurs during processing, the item will be created
|
||||||
anyways, so you can see it. Depending on the error, some information
|
anyways, so you can see it. Depending on the error, some information
|
||||||
@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
|
|||||||
whole language. These are also provided by [Stanford
|
whole language. These are also provided by [Stanford
|
||||||
NLP](https://nlp.stanford.edu/software/), but not for all languages.
|
NLP](https://nlp.stanford.edu/software/), but not for all languages.
|
||||||
So whether this can be used depends on the document language. Models
|
So whether this can be used depends on the document language. Models
|
||||||
exist for German, English and French currently.
|
exist for German, English, French and Spanish currently.
|
||||||
|
|
||||||
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
|
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
|
||||||
run custom rules against a text. This can be used as a fallback for
|
run custom rules against a text. This can be used as a fallback for
|
||||||
|
@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the
|
|||||||
language. Docspell uses the [Stanford NLP
|
language. Docspell uses the [Stanford NLP
|
||||||
Library](https://nlp.stanford.edu/software/) for its great machine
|
Library](https://nlp.stanford.edu/software/) for its great machine
|
||||||
learning algorithms. Some of them, like certain NLP features, are only
|
learning algorithms. Some of them, like certain NLP features, are only
|
||||||
available for some languages – namely German, English and French. The
|
available for some languages – namely German, English, French and
|
||||||
reason is that the required statistical models are not available for
|
Spanish. The reason is that the required statistical models are not
|
||||||
other languages. However, docspell can still run other algorithms for
|
available for other languages. However, docspell can still run other
|
||||||
the other languages, like classification and custom rules based on the
|
algorithms for the other languages, like classification and custom
|
||||||
address book.
|
rules based on the address book.
|
||||||
|
|
||||||
More information about file processing and text analysis can be found
|
More information about file processing and text analysis can be found
|
||||||
[here](@/docs/joex/file-processing.md#text-analysis).
|
[here](@/docs/joex/file-processing.md#text-analysis).
|
||||||
|
Loading…
x
Reference in New Issue
Block a user