2019-07-22 22:53:30 +00:00
|
|
|
package docspell.build
|
|
|
|
|
|
|
|
import sbt.{Def, _}
|
|
|
|
import sbt.Keys._
|
|
|
|
|
2021-08-19 06:50:30 +00:00
|
|
|
/** Copies selected files out of dependency archives into the managed resources of a
  * local sbt project.
  *
  * The reason is that the Stanford NER model files are very large: the jar file for the
  * English models is about 1G and the jar file for the German models is about 170M, but
  * only one file of about 60M is needed from each jar. So, purely to save roughly 1GB of
  * file size when packaging docspell, this ugly plugin exists.
  *
  * The jar files to filter must be added to the `libraryDependencies` in the
  * "NerModels" configuration.
  */
object NerModelsPlugin extends AutoPlugin {

  /** Keys and the configuration made available to build definitions. */
  object autoImport {

    /** Configuration that holds the archives to extract files from. */
    val NerModels = config("NerModels")

    /** Predicate on entry names; entries it accepts are extracted. */
    val nerModelsFilter = settingKey[String => Boolean]("Which files to keep.")

    /** Task running the extraction; its result is what [[filterArtifacts]] returns. */
    val nerModelsRunFilter = taskKey[Seq[File]]("Extract files from libraryDependencies")
  }

  import autoImport._

  /** Base settings of the plugin.
    *
    * The default filter rejects every name. The extraction task is registered as a
    * resource generator of the `Compile` configuration.
    */
  def nerModelSettings: Seq[Setting[_]] =
    Seq(
      nerModelsFilter := { _ => false },
      nerModelsRunFilter := {
        val log = streams.value.log
        // managed "jar" and "zip" artifacts of the NerModels configuration
        val archives = Classpaths.managedJars(NerModels, Set("jar", "zip"), update.value)
        val keep = nerModelsFilter.value
        val targetDir = (Compile / resourceManaged).value
        filterArtifacts(log, archives, keep, targetDir)
      },
      Compile / resourceGenerators += nerModelsRunFilter.taskValue
    )

  /** Settings for a project that needs the classifier models.
    *
    * Adds `Dependencies.stanfordNlpModels` to the `NerModels` configuration and replaces
    * the filter with one accepting every name that ends with an entry of `nerModels`.
    */
  def nerClassifierSettings: Seq[Setting[_]] =
    Seq(
      libraryDependencies ++= Dependencies.stanfordNlpModels.map(module => module % NerModels),
      nerModelsFilter := (entryName => nerModels.exists(suffix => entryName.endsWith(suffix)))
    )

  override def projectConfigurations: Seq[Configuration] =
    NerModels :: Nil

  override def projectSettings: Seq[Setting[_]] =
    nerModelSettings

  /** Unzips every file of `cp` into `out`, handing `nameFilter` to `IO.unzip`.
    *
    * @param logger
    *   receives a single info message before the extraction starts
    * @param cp
    *   the archives to process
    * @param nameFilter
    *   passed unchanged to `IO.unzip` for each archive
    * @param out
    *   directory given to `IO.unzip` as the extraction target
    * @return
    *   the concatenated results of all `IO.unzip` calls (presumably the extracted files)
    */
  def filterArtifacts(
      logger: Logger,
      cp: Classpath,
      nameFilter: NameFilter,
      out: File
  ): Seq[File] = {
    logger.info("NerModels: Filtering artifacts...")
    val archives = cp.files
    archives.flatMap(archive => IO.unzip(archive, out, nameFilter))
  }

  /** File name suffixes used by the filter in [[nerClassifierSettings]], grouped by
    * language.
    */
  private val nerModels: List[String] = {
    val english = List(
      "english.conll.4class.distsim.crf.ser.gz",
      "regexner_caseless.tab",
      "regexner_cased.tab",
      "english-left3words-distsim.tagger",
      "english-left3words-distsim.tagger.props"
    )
    val german = List(
      "german.distsim.crf.ser.gz",
      "german-mwt.tsv",
      "german-ud.tagger",
      "german-ud.tagger.props"
    )
    val french = List(
      "french-wikiner-4class.crf.ser.gz",
      "french-mwt-statistical.tsv",
      "french-mwt.tagger",
      "french-mwt.tsv",
      "french-ud.tagger",
      "french-ud.tagger.props"
    )
    val spanish = List(
      "spanish.ancora.distsim.s512.crf.ser.gz",
      "spanish-mwt.tsv",
      "spanish-ud.tagger",
      "kbp_regexner_number_sp.tag",
      "kbp_regexner_mapping_sp.tag"
    )
    // same overall order as a single flat list: English, German, French, Spanish
    english ::: german ::: french ::: spanish
  }
}
|