package docspell.build

import sbt.{Def, _}
import sbt.Keys._

/** Take some files from dependencies and put them into the resources of a local sbt
  * project.
  *
  * The reason is that the stanford ner model files are very very large: the jar file for
  * the english models is about 1G and the jar file for the german models is about 170M.
  * But I only need one file that is about 60M from each jar. So this ugly plugin exists
  * just for the sake of saving about 1GB of file size when packaging docspell.
  *
  * The jar files to filter must be added to the libraryDependencies in config
  * "NerModels".
  */
object NerModelsPlugin extends AutoPlugin {

  object autoImport {

    /** Configuration under which the archives to be filtered are declared. */
    val NerModels = config("NerModels")

    /** Predicate over entry names; it is handed to the extraction as the name filter. */
    val nerModelsFilter = settingKey[String => Boolean]("Which files to keep.")

    /** Task that performs the extraction and yields the resulting files. */
    val nerModelsRunFilter = taskKey[Seq[File]]("Extract files from libraryDependencies")

  }

  import autoImport._

  /** File name suffixes accepted by the filter installed via [[nerClassifierSettings]]. */
  private val nerModels = List(
    "german.distsim.crf.ser.gz",
    "english.conll.4class.distsim.crf.ser.gz",
    "french-wikiner-4class.crf.ser.gz",
    "french-mwt-statistical.tsv",
    "french-mwt.tagger",
    "french-mwt.tsv",
    "german-mwt.tsv",
    "german-ud.tagger",
    "german-ud.tagger.props",
    "french-ud.tagger",
    "french-ud.tagger.props",
    "english-left3words-distsim.tagger",
    "english-left3words-distsim.tagger.props"
  )

  /** Base settings of the plugin.
    *
    * The default filter rejects every name. The extraction task collects the managed
    * "jar" and "zip" artifacts of the `NerModels` configuration and unpacks them into
    * the managed resource directory of `Compile`; the task is also registered as a
    * resource generator for `Compile`.
    */
  def nerModelSettings: Seq[Setting[_]] =
    Seq(
      nerModelsFilter := { (_: String) => false },
      nerModelsRunFilter := {
        val log       = streams.value.log
        val archives  = Classpaths.managedJars(NerModels, Set("jar", "zip"), update.value)
        val keep      = nerModelsFilter.value
        val targetDir = (Compile / resourceManaged).value
        filterArtifacts(log, archives, keep, targetDir)
      },
      Compile / resourceGenerators += nerModelsRunFilter.taskValue
    )

  /** Settings that add `Dependencies.stanfordNlpModels` to the `NerModels` configuration
    * and install a filter accepting every name that ends with one of the known model
    * file names.
    */
  def nerClassifierSettings: Seq[Setting[_]] =
    Seq(
      libraryDependencies ++= Dependencies.stanfordNlpModels.map(_ % NerModels),
      nerModelsFilter := { entryName =>
        nerModels.exists(suffix => entryName.endsWith(suffix))
      }
    )

  override def projectConfigurations: Seq[Configuration] =
    Seq(NerModels)

  override def projectSettings: Seq[Setting[_]] =
    nerModelSettings

  /** Unzips every file of the given classpath into `out`, passing `nameFilter` to
    * `IO.unzip`.
    *
    * @param logger
    *   receives a single info message before the extraction starts
    * @param cp
    *   the classpath whose files are unzipped
    * @param nameFilter
    *   the filter given to each `IO.unzip` call
    * @param out
    *   the directory given to each `IO.unzip` call as its target
    * @return
    *   the files returned by the individual `IO.unzip` calls, concatenated
    */
  def filterArtifacts(
      logger: Logger,
      cp: Classpath,
      nameFilter: NameFilter,
      out: File
  ): Seq[File] = {
    logger.info("NerModels: Filtering artifacts...")
    val archives = cp.files
    archives.flatMap(archive => IO.unzip(archive, out, nameFilter))
  }
}