mirror of
https://github.com/TheAnachronism/docspell.git
synced 2024-11-13 02:31:10 +00:00
501c6f2988
Models for Spanish are available and have now been added. The Hungarian language has also been added to the list of supported languages (mainly for tesseract; there are no NLP models for it).
96 lines
2.6 KiB
Scala
96 lines
2.6 KiB
Scala
package docspell.build
|
|
|
|
import sbt.{Def, _}
|
|
import sbt.Keys._
|
|
|
|
/** Copies selected files out of dependency archives into the managed resources of a
  * local sbt project.
  *
  * Motivation: the Stanford NER model jars are huge (the English one is about 1G, the
  * German one about 170M), while only a single file of roughly 60M is needed from each
  * of them. Extracting just those files saves about 1GB when packaging docspell, which
  * is the sole reason this rather ugly plugin exists.
  *
  * Archives that should be filtered have to be added to `libraryDependencies` using the
  * "NerModels" configuration.
  */
object NerModelsPlugin extends AutoPlugin {

  /** Keys and the configuration that this plugin exposes to build definitions. */
  object autoImport {
    // Dependency configuration holding the archives that are to be filtered.
    val NerModels = config("NerModels")

    // Predicate over archive entry names; entries for which it yields `true` are kept.
    val nerModelsFilter = settingKey[String => Boolean]("Which files to keep.")

    // Task that performs the extraction and yields the extracted files.
    val nerModelsRunFilter = taskKey[Seq[File]]("Extract files from libraryDependencies")
  }

  import autoImport._

  /** Base settings: a filter that rejects every entry, the extraction task, and the
    * registration of that task as a resource generator for the `Compile` scope.
    */
  def nerModelSettings: Seq[Setting[_]] =
    Seq(
      // Nothing is extracted unless a project overrides this filter.
      nerModelsFilter := { _ => false },
      nerModelsRunFilter := {
        val log = streams.value.log
        // Only "jar" and "zip" artifacts resolved for the NerModels configuration.
        val archives = Classpaths.managedJars(NerModels, Set("jar", "zip"), update.value)
        val keepEntry = nerModelsFilter.value
        val targetDir = (Compile / resourceManaged).value
        filterArtifacts(log, archives, keepEntry, targetDir)
      },
      Compile / resourceGenerators += nerModelsRunFilter.taskValue
    )

  /** Settings that add the Stanford NLP model archives to the `NerModels` configuration
    * and keep only entries whose name ends with one of the known model file names.
    *
    * NOTE(review): these are not part of `projectSettings`; presumably a project adds
    * them explicitly in the build definition -- confirm against the build.
    */
  def nerClassifierSettings: Seq[Setting[_]] =
    Seq(
      libraryDependencies ++= (
        for (module <- Dependencies.stanfordNlpModels) yield module % NerModels
      ),
      nerModelsFilter := { entryName =>
        nerModels.exists(suffix => entryName.endsWith(suffix))
      }
    )

  /** Makes the `NerModels` configuration known to projects using this plugin. */
  override def projectConfigurations: Seq[Configuration] =
    NerModels :: Nil

  /** The settings this plugin contributes to a project: the base settings only. */
  override def projectSettings: Seq[Setting[_]] =
    nerModelSettings

  /** Unzips every file of the given classpath into `out`, passing `nameFilter` to the
    * unzip operation so that it selects the entries to extract.
    *
    * @param logger     receives a single info message before the extraction starts
    * @param cp         the archives to extract from
    * @param nameFilter selects the archive entries to extract
    * @param out        the directory to extract into
    * @return the files returned by the unzip operations, concatenated in classpath order
    */
  def filterArtifacts(
      logger: Logger,
      cp: Classpath,
      nameFilter: NameFilter,
      out: File
  ): Seq[File] = {
    logger.info("NerModels: Filtering artifacts...")
    for {
      archive <- cp.files
      extracted <- IO.unzip(archive, out, nameFilter)
    } yield extracted
  }

  /** File names (used as name suffixes by `nerClassifierSettings`) of the model files to
    * keep, grouped by language.
    */
  private val nerModels: List[String] = {
    val english = List(
      "english.conll.4class.distsim.crf.ser.gz",
      "regexner_caseless.tab",
      "regexner_cased.tab",
      "english-left3words-distsim.tagger",
      "english-left3words-distsim.tagger.props"
    )
    val german = List(
      "german.distsim.crf.ser.gz",
      "german-mwt.tsv",
      "german-ud.tagger",
      "german-ud.tagger.props"
    )
    val french = List(
      "french-wikiner-4class.crf.ser.gz",
      "french-mwt-statistical.tsv",
      "french-mwt.tagger",
      "french-mwt.tsv",
      "french-ud.tagger",
      "french-ud.tagger.props"
    )
    val spanish = List(
      "spanish.ancora.distsim.s512.crf.ser.gz",
      "spanish-mwt.tsv",
      "spanish-ud.tagger",
      "kbp_regexner_number_sp.tag",
      "kbp_regexner_mapping_sp.tag"
    )
    english ::: german ::: french ::: spanish
  }
}
|