mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-08-05 02:24:52 +00:00
Updating stanford corenlp to 4.3.2; adding more languages
There are models for Spanish, that have been added now. Also the Hungarian language has been added to the list of supported languages (for tesseract mainly, no nlp models)
This commit is contained in:
@ -45,15 +45,16 @@ object DateFind {
|
|||||||
private[this] val jpnChars =
|
private[this] val jpnChars =
|
||||||
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet
|
||||||
|
|
||||||
/** Splits `text` into the words used for date detection.
  *
  * For Japanese, every character that is not contained in `jpnChars` is
  * replaced by a blank before splitting. The text is then tokenized on
  * whitespace and the separators `-`, `.`, `,`, `/`, `年`, `月` and `日`.
  * Finally, language specific filler words are removed: "gada" for Latvian
  * and the connective "de" for Spanish.
  *
  * @param text the text to split
  * @param lang the language of `text`; selects the language specific steps
  * @return the remaining words as a pure stream
  */
private[date] def splitWords(text: String, lang: Language): Stream[Pure, Word] = {
  val stext =
    if (lang == Language.Japanese) {
      text.map(c => if (jpnChars.contains(c)) c else ' ')
    } else text

  TextSplitter
    .splitToken(stext, " -\t.,\n\r/年月日".toSet)
    .filter(w => lang != Language.Latvian || w.value != "gada")
    // Compare case-insensitively so that "15 DE DICIEMBRE DE 2011" or
    // "15 De Diciembre De 2011" lose the connective just like the
    // lower-case form does.
    // NOTE(review): assumes `Word.value` is a String that has not been
    // lower-cased by TextSplitter already — confirm.
    .filter(w => lang != Language.Spanish || !w.value.equalsIgnoreCase("de"))
}
|
||||||
|
|
||||||
case class SimpleDate(year: Int, month: Int, day: Int) {
|
case class SimpleDate(year: Int, month: Int, day: Int) {
|
||||||
@ -91,6 +92,7 @@ object DateFind {
|
|||||||
case Language.French => dmy.or(ymd).or(mdy)
|
case Language.French => dmy.or(ymd).or(mdy)
|
||||||
case Language.Italian => dmy.or(ymd).or(mdy)
|
case Language.Italian => dmy.or(ymd).or(mdy)
|
||||||
case Language.Spanish => dmy.or(ymd).or(mdy)
|
case Language.Spanish => dmy.or(ymd).or(mdy)
|
||||||
|
case Language.Hungarian => ymd
|
||||||
case Language.Czech => dmy.or(ymd).or(mdy)
|
case Language.Czech => dmy.or(ymd).or(mdy)
|
||||||
case Language.Danish => dmy.or(ymd).or(mdy)
|
case Language.Danish => dmy.or(ymd).or(mdy)
|
||||||
case Language.Finnish => dmy.or(ymd).or(mdy)
|
case Language.Finnish => dmy.or(ymd).or(mdy)
|
||||||
|
@ -30,6 +30,8 @@ object MonthName {
|
|||||||
italian
|
italian
|
||||||
case Language.Spanish =>
|
case Language.Spanish =>
|
||||||
spanish
|
spanish
|
||||||
|
case Language.Hungarian =>
|
||||||
|
hungarian
|
||||||
case Language.Swedish =>
|
case Language.Swedish =>
|
||||||
swedish
|
swedish
|
||||||
case Language.Norwegian =>
|
case Language.Norwegian =>
|
||||||
@ -324,4 +326,19 @@ object MonthName {
|
|||||||
List("11", "נובמבר"),
|
List("11", "נובמבר"),
|
||||||
List("12", "דצמבר")
|
List("12", "דצמבר")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
private val hungarian = List(
|
||||||
|
List("I", "jan", "január"),
|
||||||
|
List("II", "febr", "február"),
|
||||||
|
List("III", "márc", "március"),
|
||||||
|
List("IV", "ápr", "április"),
|
||||||
|
List("V", "máj", "május"),
|
||||||
|
List("VI", "jún", "június"),
|
||||||
|
List("VII", "júl", "július"),
|
||||||
|
List("VIII", "aug", "augusztus"),
|
||||||
|
List("IX", "szept", "szeptember"),
|
||||||
|
List("X", "okt", "október"),
|
||||||
|
List("XI", "nov", "november"),
|
||||||
|
List("XII", "dec", "december")
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,7 @@ object BasicCRFAnnotator {
|
|||||||
private[this] val logger = getLogger
|
private[this] val logger = getLogger
|
||||||
|
|
||||||
// assert correct resource names
|
// assert correct resource names
|
||||||
List(Language.French, Language.German, Language.English).foreach(classifierResource)
|
NLPLanguage.all.toList.foreach(classifierResource)
|
||||||
|
|
||||||
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
type Annotator = AbstractSequenceClassifier[CoreLabel]
|
||||||
|
|
||||||
@ -70,6 +70,12 @@ object BasicCRFAnnotator {
|
|||||||
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
"/edu/stanford/nlp/models/ner/german.distsim.crf.ser.gz"
|
||||||
case Language.English =>
|
case Language.English =>
|
||||||
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
"/edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||||
|
case Language.Spanish =>
|
||||||
|
"/edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz"
|
||||||
|
// case Language.Italian =>
|
||||||
|
// "/edu/stanford/nlp/models/ner/italian.crf.ser.gz"
|
||||||
|
// case Language.Hungarian =>
|
||||||
|
// "/edu/stanford/nlp/models/ner/hungarian.crf.ser.gz"
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,12 +83,14 @@ object BasicCRFAnnotator {
|
|||||||
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
private[this] lazy val germanNerClassifier = makeAnnotator(Language.German)
|
||||||
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
private[this] lazy val englishNerClassifier = makeAnnotator(Language.English)
|
||||||
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
private[this] lazy val frenchNerClassifier = makeAnnotator(Language.French)
|
||||||
|
private[this] lazy val spanishNerClassifier = makeAnnotator(Language.Spanish)
|
||||||
|
|
||||||
def forLang(language: NLPLanguage): Annotator =
|
def forLang(language: NLPLanguage): Annotator =
|
||||||
language match {
|
language match {
|
||||||
case Language.French => frenchNerClassifier
|
case Language.French => frenchNerClassifier
|
||||||
case Language.German => germanNerClassifier
|
case Language.German => germanNerClassifier
|
||||||
case Language.English => englishNerClassifier
|
case Language.English => englishNerClassifier
|
||||||
|
case Language.Spanish => spanishNerClassifier
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -37,6 +37,8 @@ object Properties {
|
|||||||
Properties.nerEnglish(regexNerFile)
|
Properties.nerEnglish(regexNerFile)
|
||||||
case Language.French =>
|
case Language.French =>
|
||||||
Properties.nerFrench(regexNerFile, highRecall)
|
Properties.nerFrench(regexNerFile, highRecall)
|
||||||
|
case Language.Spanish =>
|
||||||
|
Properties.nerSpanish(regexNerFile, highRecall)
|
||||||
}
|
}
|
||||||
case StanfordNerSettings.RegexOnly(path) =>
|
case StanfordNerSettings.RegexOnly(path) =>
|
||||||
Properties.regexNerOnly(path)
|
Properties.regexNerOnly(path)
|
||||||
@ -88,6 +90,18 @@ object Properties {
|
|||||||
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
"ner.model" -> "edu/stanford/nlp/models/ner/french-wikiner-4class.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
|
||||||
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
).withRegexNer(regexNerMappingFile).withHighRecall(highRecall)
|
||||||
|
|
||||||
|
/** Builds the CoreNLP properties for running NER on Spanish text.
  *
  * The base settings select the `tokenize, ssplit, mwt, pos, lemma, ner`
  * annotators with the Spanish mwt mapping, pos tagger and NER model, enable
  * the numeric classifiers and disable SUTime. The regex-NER mapping file
  * and the high-recall flag are then applied on top.
  *
  * @param regexNerMappingFile optional mapping file passed to `withRegexNer`
  * @param highRecall flag passed to `withHighRecall`
  */
def nerSpanish(regexNerMappingFile: Option[String], highRecall: Boolean): JProps = {
  val spanishDefaults = Properties(
    "annotators"                  -> "tokenize, ssplit, mwt, pos, lemma, ner",
    "tokenize.language"           -> "es",
    "mwt.mappingFile"             -> "edu/stanford/nlp/models/mwt/spanish/spanish-mwt.tsv",
    "pos.model"                   -> "edu/stanford/nlp/models/pos-tagger/spanish-ud.tagger",
    "ner.model"                   -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
    "ner.applyNumericClassifiers" -> "true",
    "ner.useSUTime"               -> "false",
    "ner.language"                -> "es"
  )

  spanishDefaults
    .withRegexNer(regexNerMappingFile)
    .withHighRecall(highRecall)
}
|
||||||
|
|
||||||
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
def regexNerOnly(regexNerMappingFile: Path): JProps =
|
||||||
Properties(
|
Properties(
|
||||||
"annotators" -> "tokenize,ssplit"
|
"annotators" -> "tokenize,ssplit"
|
||||||
|
Binary file not shown.
@ -13,7 +13,7 @@ import docspell.files.TestFiles
|
|||||||
|
|
||||||
import munit._
|
import munit._
|
||||||
|
|
||||||
class DateFindSpec extends FunSuite {
|
class DateFindTest extends FunSuite {
|
||||||
|
|
||||||
test("find simple dates") {
|
test("find simple dates") {
|
||||||
val expect = Vector(
|
val expect = Vector(
|
||||||
@ -179,4 +179,29 @@ class DateFindSpec extends FunSuite {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("find spanish dates") {
  // A date written out in words, embedded in surrounding text.
  assertEquals(
    DateFind
      .findDates("México, Distrito Federal a 15 de Diciembre de 2011", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2011, 12, 15),
        NerLabel("15 de Diciembre de 2011", NerTag.Date, 27, 50)
      )
    )
  )

  // A purely numeric date whose parts are separated by '-'.
  assertEquals(
    DateFind
      .findDates("2021-11-19", Language.Spanish)
      .toVector,
    Vector(
      NerDateLabel(
        LocalDate.of(2021, 11, 19),
        NerLabel("2021-11-19", NerTag.Date, 0, 10)
      )
    )
  )
}
|
||||||
}
|
}
|
@ -30,7 +30,7 @@ object Language {
|
|||||||
override val allowsNLP = true
|
override val allowsNLP = true
|
||||||
}
|
}
|
||||||
object NLPLanguage {
|
object NLPLanguage {
|
||||||
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
|
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French, Spanish)
|
||||||
}
|
}
|
||||||
|
|
||||||
case object German extends NLPLanguage {
|
case object German extends NLPLanguage {
|
||||||
@ -53,11 +53,16 @@ object Language {
|
|||||||
val iso3 = "ita"
|
val iso3 = "ita"
|
||||||
}
|
}
|
||||||
|
|
||||||
case object Spanish extends Language {
|
case object Spanish extends NLPLanguage {
|
||||||
val iso2 = "es"
|
val iso2 = "es"
|
||||||
val iso3 = "spa"
|
val iso3 = "spa"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case object Hungarian extends Language {
|
||||||
|
val iso2 = "hu"
|
||||||
|
val iso3 = "hun"
|
||||||
|
}
|
||||||
|
|
||||||
case object Portuguese extends Language {
|
case object Portuguese extends Language {
|
||||||
val iso2 = "pt"
|
val iso2 = "pt"
|
||||||
val iso3 = "por"
|
val iso3 = "por"
|
||||||
@ -125,6 +130,7 @@ object Language {
|
|||||||
French,
|
French,
|
||||||
Italian,
|
Italian,
|
||||||
Spanish,
|
Spanish,
|
||||||
|
Hungarian,
|
||||||
Dutch,
|
Dutch,
|
||||||
Portuguese,
|
Portuguese,
|
||||||
Czech,
|
Czech,
|
||||||
|
@ -127,7 +127,13 @@ object SolrSetup {
|
|||||||
"Add hebrew content field",
|
"Add hebrew content field",
|
||||||
addContentField(Language.Hebrew)
|
addContentField(Language.Hebrew)
|
||||||
),
|
),
|
||||||
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field")
|
SolrMigration.reIndexAll(18, "Re-Index after adding hebrew content field"),
|
||||||
|
SolrMigration[F](
|
||||||
|
19,
|
||||||
|
"Add hungarian",
|
||||||
|
addContentField(Language.Hungarian)
|
||||||
|
),
|
||||||
|
SolrMigration.reIndexAll(20, "Re-Index after adding hungarian content field")
|
||||||
)
|
)
|
||||||
|
|
||||||
def addFolderField: F[Unit] =
|
def addFolderField: F[Unit] =
|
||||||
|
@ -18,11 +18,11 @@ import docspell.joex.Config
|
|||||||
import docspell.joex.analysis.RegexNerFile
|
import docspell.joex.analysis.RegexNerFile
|
||||||
import docspell.joex.scheduler.Context
|
import docspell.joex.scheduler.Context
|
||||||
import docspell.joex.scheduler.Task
|
import docspell.joex.scheduler.Task
|
||||||
|
import docspell.store.queries.QItem
|
||||||
import docspell.store.records.RAttachment
|
import docspell.store.records.RAttachment
|
||||||
import docspell.store.records.RAttachmentSource
|
import docspell.store.records.RAttachmentSource
|
||||||
import docspell.store.records.RCollective
|
import docspell.store.records.RCollective
|
||||||
import docspell.store.records.RItem
|
import docspell.store.records.RItem
|
||||||
import docspell.store.queries.QItem
|
|
||||||
|
|
||||||
object ReProcessItem {
|
object ReProcessItem {
|
||||||
type Args = ReProcessItemArgs
|
type Args = ReProcessItemArgs
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
-- Resets all classifier models: the existing models and the file rows they
-- reference are deleted and one 'learn-classifier' job is inserted per row
-- of "classifier_setting".
-- NOTE(review): presumably needed because models written with the previous
-- NLP library version cannot be read anymore — confirm.

-- Remember the file ids of the current models so the file rows can still be
-- found after "classifier_model" has been emptied.
CREATE TEMPORARY TABLE "temp_file_ids" (
  cid VARCHAR(254) NOT NULL,
  file_id VARCHAR(254) NOT NULL
);

INSERT INTO "temp_file_ids"
  SELECT "cid", "file_id"
  FROM "classifier_model";

-- Insert one job row (task 'learn-classifier', value 'waiting') for every
-- row in "classifier_setting".
-- NOTE(review): the values are positional and must match the column order
-- of "job" — confirm against the table definition.
INSERT INTO "job"
  SELECT
    md5(random()::text),
    'learn-classifier',
    cid,
    '{"collective":"' || cid || '"}',
    'new classifier',
    now(),
    'docspell-system',
    0,
    'waiting',
    0,
    0
  FROM "classifier_setting";

-- Remove the models first, then the file rows recorded above.
DELETE FROM "classifier_model";

DELETE FROM "filemeta"
  WHERE "file_id" IN (SELECT "file_id" FROM "temp_file_ids");

DELETE FROM "filechunk"
  WHERE "file_id" IN (SELECT "file_id" FROM "temp_file_ids");

DROP TABLE "temp_file_ids";
|
@ -31,6 +31,7 @@ type Language
|
|||||||
| Latvian
|
| Latvian
|
||||||
| Japanese
|
| Japanese
|
||||||
| Hebrew
|
| Hebrew
|
||||||
|
| Hungarian
|
||||||
|
|
||||||
|
|
||||||
fromString : String -> Maybe Language
|
fromString : String -> Maybe Language
|
||||||
@ -86,6 +87,9 @@ fromString str =
|
|||||||
else if str == "heb" || str == "he" || str == "hebrew" then
|
else if str == "heb" || str == "he" || str == "hebrew" then
|
||||||
Just Hebrew
|
Just Hebrew
|
||||||
|
|
||||||
|
else if str == "hun" || str == "hu" || str == "hungarian" then
|
||||||
|
Just Hungarian
|
||||||
|
|
||||||
else
|
else
|
||||||
Nothing
|
Nothing
|
||||||
|
|
||||||
@ -144,6 +148,9 @@ toIso3 lang =
|
|||||||
Hebrew ->
|
Hebrew ->
|
||||||
"heb"
|
"heb"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"hun"
|
||||||
|
|
||||||
|
|
||||||
all : List Language
|
all : List Language
|
||||||
all =
|
all =
|
||||||
@ -164,4 +171,5 @@ all =
|
|||||||
, Latvian
|
, Latvian
|
||||||
, Japanese
|
, Japanese
|
||||||
, Hebrew
|
, Hebrew
|
||||||
|
, Hungarian
|
||||||
]
|
]
|
||||||
|
@ -67,6 +67,9 @@ gb lang =
|
|||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebrew"
|
"Hebrew"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"Hungarian"
|
||||||
|
|
||||||
|
|
||||||
de : Language -> String
|
de : Language -> String
|
||||||
de lang =
|
de lang =
|
||||||
@ -121,3 +124,6 @@ de lang =
|
|||||||
|
|
||||||
Hebrew ->
|
Hebrew ->
|
||||||
"Hebräisch"
|
"Hebräisch"
|
||||||
|
|
||||||
|
Hungarian ->
|
||||||
|
"Ungarisch"
|
||||||
|
@ -914,7 +914,7 @@ in {
|
|||||||
|
|
||||||
The full and basic variants rely on pre-build language models
|
The full and basic variants rely on pre-built language models
|
||||||
that are available for only 3 lanugages at the moment: German,
|
that are available for only 4 languages at the moment: German,
|
||||||
English and French.
|
English, French and Spanish.
|
||||||
|
|
||||||
Memory usage varies greatly among the languages. German has
|
Memory usage varies greatly among the languages. German has
|
||||||
quite large models, that require about 1G heap. So joex should
|
quite large models, that require about 1G heap. So joex should
|
||||||
|
@ -40,7 +40,7 @@ object Dependencies {
|
|||||||
val ScalaJavaTimeVersion = "2.3.0"
|
val ScalaJavaTimeVersion = "2.3.0"
|
||||||
val ScodecBitsVersion = "1.1.29"
|
val ScodecBitsVersion = "1.1.29"
|
||||||
val Slf4jVersion = "1.7.32"
|
val Slf4jVersion = "1.7.32"
|
||||||
val StanfordNlpVersion = "4.2.2"
|
val StanfordNlpVersion = "4.3.2"
|
||||||
val TikaVersion = "2.1.0"
|
val TikaVersion = "2.1.0"
|
||||||
val YamuscaVersion = "0.8.1"
|
val YamuscaVersion = "0.8.1"
|
||||||
val SwaggerUIVersion = "4.1.0"
|
val SwaggerUIVersion = "4.1.0"
|
||||||
@ -185,18 +185,16 @@ object Dependencies {
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
val stanfordNlpModels = Seq(
|
val stanfordNlpModels = {
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
val artifact = "edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion
|
||||||
.classifier("models"),
|
Seq(
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models"),
|
||||||
.classifier("models-german"),
|
artifact.classifier("models-german"),
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models-french"),
|
||||||
.classifier("models-french"),
|
artifact.classifier("models-english"),
|
||||||
("edu.stanford.nlp" % "stanford-corenlp" % StanfordNlpVersion)
|
artifact.classifier("models-spanish")
|
||||||
.classifier(
|
)
|
||||||
"models-english"
|
}
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
val tika = Seq(
|
val tika = Seq(
|
||||||
"org.apache.tika" % "tika-core" % TikaVersion
|
"org.apache.tika" % "tika-core" % TikaVersion
|
||||||
|
@ -67,18 +67,29 @@ object NerModelsPlugin extends AutoPlugin {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private val nerModels = List(
|
private val nerModels = List(
|
||||||
"german.distsim.crf.ser.gz",
|
// English
|
||||||
"english.conll.4class.distsim.crf.ser.gz",
|
"english.conll.4class.distsim.crf.ser.gz",
|
||||||
|
"regexner_caseless.tab",
|
||||||
|
"regexner_cased.tab",
|
||||||
|
"english-left3words-distsim.tagger",
|
||||||
|
"english-left3words-distsim.tagger.props",
|
||||||
|
// German
|
||||||
|
"german.distsim.crf.ser.gz",
|
||||||
|
"german-mwt.tsv",
|
||||||
|
"german-ud.tagger",
|
||||||
|
"german-ud.tagger.props",
|
||||||
|
// French
|
||||||
"french-wikiner-4class.crf.ser.gz",
|
"french-wikiner-4class.crf.ser.gz",
|
||||||
"french-mwt-statistical.tsv",
|
"french-mwt-statistical.tsv",
|
||||||
"french-mwt.tagger",
|
"french-mwt.tagger",
|
||||||
"french-mwt.tsv",
|
"french-mwt.tsv",
|
||||||
"german-mwt.tsv",
|
|
||||||
"german-ud.tagger",
|
|
||||||
"german-ud.tagger.props",
|
|
||||||
"french-ud.tagger",
|
"french-ud.tagger",
|
||||||
"french-ud.tagger.props",
|
"french-ud.tagger.props",
|
||||||
"english-left3words-distsim.tagger",
|
// Spanish
|
||||||
"english-left3words-distsim.tagger.props"
|
"spanish.ancora.distsim.s512.crf.ser.gz",
|
||||||
|
"spanish-mwt.tsv",
|
||||||
|
"spanish-ud.tagger",
|
||||||
|
"kbp_regexner_number_sp.tag",
|
||||||
|
"kbp_regexner_mapping_sp.tag"
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -486,8 +486,8 @@ This setting defines which NLP mode to use. It defaults to `full`,
|
|||||||
which requires more memory for certain languages (with the advantage
|
which requires more memory for certain languages (with the advantage
|
||||||
of better results). Other values are `basic`, `regexonly` and
|
of better results). Other values are `basic`, `regexonly` and
|
||||||
`disabled`. The modes `full` and `basic` use pre-defined lanugage
|
`disabled`. The modes `full` and `basic` use pre-defined language
|
||||||
models for procesing documents of languaes German, English and French.
|
models for processing documents in the languages German, English, French and
|
||||||
These require some amount of memory (see below).
|
Spanish. These require some amount of memory (see below).
|
||||||
|
|
||||||
The mode `basic` is like the "light" variant to `full`. It doesn't use
|
The mode `basic` is like the "light" variant to `full`. It doesn't use
|
||||||
all NLP features, which makes memory consumption much lower, but comes
|
all NLP features, which makes memory consumption much lower, but comes
|
||||||
|
@ -8,10 +8,10 @@ mktoc = true
|
|||||||
+++
|
+++
|
||||||
|
|
||||||
When uploading a file, it is only saved to the database together with
|
When uploading a file, it is only saved to the database together with
|
||||||
the given meta information. The file is not visible in the ui yet.
|
the given meta information as a "job". The file is not visible in the
|
||||||
Then joex takes the next such file (or files in case you uploaded
|
ui yet. Then joex takes the next such job and starts processing it.
|
||||||
many) and starts processing it. When processing finished, the item and
|
When processing has finished, the item and its files will show up in the
|
||||||
its files will show up in the ui.
|
ui.
|
||||||
|
|
||||||
If an error occurs during processing, the item will be created
|
If an error occurs during processing, the item will be created
|
||||||
anyways, so you can see it. Depending on the error, some information
|
anyways, so you can see it. Depending on the error, some information
|
||||||
@ -400,7 +400,7 @@ names etc. This also requires a statistical model, but this time for a
|
|||||||
whole language. These are also provided by [Stanford
|
whole language. These are also provided by [Stanford
|
||||||
NLP](https://nlp.stanford.edu/software/), but not for all languages.
|
NLP](https://nlp.stanford.edu/software/), but not for all languages.
|
||||||
So whether this can be used depends on the document language. Models
|
So whether this can be used depends on the document language. Models
|
||||||
exist for German, English and French currently.
|
exist for German, English, French and Spanish currently.
|
||||||
|
|
||||||
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
|
Then [Stanford NLP](https://nlp.stanford.edu/software/) also allows to
|
||||||
run custom rules against a text. This can be used as a fallback for
|
run custom rules against a text. This can be used as a fallback for
|
||||||
|
@ -147,11 +147,11 @@ experience. The features of text analysis strongly depend on the
|
|||||||
language. Docspell uses the [Stanford NLP
|
language. Docspell uses the [Stanford NLP
|
||||||
Library](https://nlp.stanford.edu/software/) for its great machine
|
Library](https://nlp.stanford.edu/software/) for its great machine
|
||||||
learning algorithms. Some of them, like certain NLP features, are only
|
learning algorithms. Some of them, like certain NLP features, are only
|
||||||
available for some languages – namely German, English and French. The
|
available for some languages – namely German, English, French and
|
||||||
reason is that the required statistical models are not available for
|
Spanish. The reason is that the required statistical models are not
|
||||||
other languages. However, docspell can still run other algorithms for
|
available for other languages. However, docspell can still run other
|
||||||
the other languages, like classification and custom rules based on the
|
algorithms for the other languages, like classification and custom
|
||||||
address book.
|
rules based on the address book.
|
||||||
|
|
||||||
More information about file processing and text analysis can be found
|
More information about file processing and text analysis can be found
|
||||||
[here](@/docs/joex/file-processing.md#text-analysis).
|
[here](@/docs/joex/file-processing.md#text-analysis).
|
||||||
|
Reference in New Issue
Block a user