mirror of
https://github.com/TheAnachronism/docspell.git
synced 2025-06-22 02:18:26 +00:00
Reorganize nlp pipeline and add nlp-unsupported language italian
Improves and reorganizes how NLP pipelines are set up. Users can now choose from many options, depending on their hardware and usage scenario. This is the basis for supporting more languages without depending on what stanford-nlp supports. Support for such languages then involves only text extraction and simple regex-NER processing.
This commit is contained in:
@ -1,5 +1,7 @@
|
||||
package docspell.common
|
||||
|
||||
import cats.data.NonEmptyList
|
||||
|
||||
import io.circe.{Decoder, Encoder}
|
||||
|
||||
sealed trait Language { self: Product =>
|
||||
@ -11,28 +13,41 @@ sealed trait Language { self: Product =>
|
||||
|
||||
def iso3: String
|
||||
|
||||
val allowsNLP: Boolean = false
|
||||
|
||||
private[common] def allNames =
|
||||
Set(name, iso3, iso2)
|
||||
}
|
||||
|
||||
object Language {
|
||||
/** Marker for languages that report `allowsNLP == true`.
  *
  * It overrides the `false` default declared on `Language`.
  */
sealed trait NLPLanguage extends Language with Product {
  override val allowsNLP: Boolean = true
}
|
||||
object NLPLanguage {

  /** All languages that extend `NLPLanguage`; non-empty by construction. */
  val all: NonEmptyList[NLPLanguage] =
    NonEmptyList.of[NLPLanguage](German, English, French)
}
|
||||
|
||||
case object German extends Language {
|
||||
case object German extends NLPLanguage {
|
||||
val iso2 = "de"
|
||||
val iso3 = "deu"
|
||||
}
|
||||
|
||||
case object English extends Language {
|
||||
case object English extends NLPLanguage {
|
||||
val iso2 = "en"
|
||||
val iso3 = "eng"
|
||||
}
|
||||
|
||||
case object French extends Language {
|
||||
case object French extends NLPLanguage {
|
||||
val iso2 = "fr"
|
||||
val iso3 = "fra"
|
||||
}
|
||||
|
||||
val all: List[Language] = List(German, English, French)
|
||||
/** Italian. Extends plain `Language` rather than `NLPLanguage`, so `allowsNLP` keeps its
  * default of `false`.
  */
case object Italian extends Language {
  val iso2: String = "it"
  val iso3: String = "ita"
}
|
||||
|
||||
val all: List[Language] = List(German, English, French, Italian)
|
||||
|
||||
def fromString(str: String): Either[String, Language] = {
|
||||
val lang = str.toLowerCase
|
||||
|
@ -6,16 +6,18 @@ sealed trait NlpMode { self: Product =>
|
||||
self.productPrefix
|
||||
}
|
||||
object NlpMode {
|
||||
case object Full extends NlpMode
|
||||
case object Basic extends NlpMode
|
||||
case object Disabled extends NlpMode
|
||||
case object Full extends NlpMode
|
||||
case object Basic extends NlpMode
|
||||
case object RegexOnly extends NlpMode
|
||||
case object Disabled extends NlpMode
|
||||
|
||||
def fromString(name: String): Either[String, NlpMode] =
|
||||
name.toLowerCase match {
|
||||
case "full" => Right(Full)
|
||||
case "basic" => Right(Basic)
|
||||
case "disabled" => Right(Disabled)
|
||||
case _ => Left(s"Unknown nlp-mode: $name")
|
||||
case "full" => Right(Full)
|
||||
case "basic" => Right(Basic)
|
||||
case "regexonly" => Right(RegexOnly)
|
||||
case "disabled" => Right(Disabled)
|
||||
case _ => Left(s"Unknown nlp-mode: $name")
|
||||
}
|
||||
|
||||
def unsafeFromString(name: String): NlpMode =
|
||||
|
@ -0,0 +1,20 @@
|
||||
package docspell.common.syntax
|
||||
|
||||
import java.nio.file.Path
|
||||
|
||||
/** Extension methods for `java.nio.file.Path`.
  *
  * Mix this trait into a syntax object, or import `FileSyntax._`, to bring the implicit
  * `PathOps` wrapper into scope.
  */
trait FileSyntax {

  implicit final class PathOps(p: Path) {

    /** The absolute, normalized form of this path.
      *
      * The path is made absolute before it is normalized. Normalizing a relative path first
      * cannot remove its leading `..` elements, so they would survive into the absolute
      * result.
      */
    def absolutePath: Path =
      p.toAbsolutePath.normalize()

    /** String form of `absolutePath`. */
    def absolutePathAsString: String =
      absolutePath.toString

    /** Resolves `next` against this path by delegating to `Path.resolve`. */
    def /(next: String): Path =
      p.resolve(next)
  }
}

object FileSyntax extends FileSyntax
|
@ -2,6 +2,11 @@ package docspell.common
|
||||
|
||||
package object syntax {
|
||||
|
||||
object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
|
||||
object all
|
||||
extends EitherSyntax
|
||||
with StreamSyntax
|
||||
with StringSyntax
|
||||
with LoggerSyntax
|
||||
with FileSyntax
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user