Reorganize nlp pipeline and add nlp-unsupported language Italian

Improves and reorganizes how nlp pipelines are set up. Users can now
choose from many options, depending on their hardware and usage
scenario.

This is the basis for supporting more languages without depending on
what stanford-nlp supports. Support for those languages then consists
of text extraction and simple regex-ner processing.
This commit is contained in:
Eike Kettner
2021-01-16 23:43:24 +01:00
parent a70e9ab614
commit f01646aeb5
29 changed files with 676 additions and 255 deletions

View File

@ -1,5 +1,7 @@
package docspell.common
import cats.data.NonEmptyList
import io.circe.{Decoder, Encoder}
sealed trait Language { self: Product =>
@ -11,28 +13,41 @@ sealed trait Language { self: Product =>
def iso3: String
val allowsNLP: Boolean = false
private[common] def allNames =
Set(name, iso3, iso2)
}
object Language {
sealed trait NLPLanguage extends Language with Product {
override val allowsNLP = true
}
object NLPLanguage {
val all: NonEmptyList[NLPLanguage] = NonEmptyList.of(German, English, French)
}
case object German extends Language {
case object German extends NLPLanguage {
val iso2 = "de"
val iso3 = "deu"
}
case object English extends Language {
case object English extends NLPLanguage {
val iso2 = "en"
val iso3 = "eng"
}
case object French extends Language {
case object French extends NLPLanguage {
val iso2 = "fr"
val iso3 = "fra"
}
val all: List[Language] = List(German, English, French)
case object Italian extends Language {
val iso2 = "it"
val iso3 = "ita"
}
val all: List[Language] = List(German, English, French, Italian)
def fromString(str: String): Either[String, Language] = {
val lang = str.toLowerCase

View File

@ -6,16 +6,18 @@ sealed trait NlpMode { self: Product =>
self.productPrefix
}
object NlpMode {
case object Full extends NlpMode
case object Basic extends NlpMode
case object Disabled extends NlpMode
case object Full extends NlpMode
case object Basic extends NlpMode
case object RegexOnly extends NlpMode
case object Disabled extends NlpMode
def fromString(name: String): Either[String, NlpMode] =
name.toLowerCase match {
case "full" => Right(Full)
case "basic" => Right(Basic)
case "disabled" => Right(Disabled)
case _ => Left(s"Unknown nlp-mode: $name")
case "full" => Right(Full)
case "basic" => Right(Basic)
case "regexonly" => Right(RegexOnly)
case "disabled" => Right(Disabled)
case _ => Left(s"Unknown nlp-mode: $name")
}
def unsafeFromString(name: String): NlpMode =

View File

@ -0,0 +1,20 @@
package docspell.common.syntax
import java.nio.file.Path
/** Extension methods for working with `java.nio.file.Path` values. */
trait FileSyntax {

  implicit final class PathOps(p: Path) {

    /** The absolute form of this path with redundant `.` and `..` segments removed.
      *
      * The path is made absolute before it is normalized: a relative path such as
      * `../x` keeps its leading `..` under `normalize()` alone, so normalizing first
      * would leave that segment in the absolute result.
      */
    def absolutePath: Path =
      p.toAbsolutePath.normalize()

    /** String form of [[absolutePath]]. */
    def absolutePathAsString: String =
      absolutePath.toString

    /** Resolves `next` against this path, as `Path.resolve(String)` does. */
    def /(next: String): Path =
      p.resolve(next)
  }
}

/** Import `FileSyntax._` to bring the `Path` extension methods into scope. */
object FileSyntax extends FileSyntax

View File

@ -2,6 +2,11 @@ package docspell.common
package object syntax {
object all extends EitherSyntax with StreamSyntax with StringSyntax with LoggerSyntax
object all
extends EitherSyntax
with StreamSyntax
with StringSyntax
with LoggerSyntax
with FileSyntax
}