diff --git a/.scalafmt.conf b/.scalafmt.conf index 0148baae..ab23da00 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,6 +1,6 @@ version = "2.4.2" -align = most +align = more #align.arrowEnumeratorGenerator = true maxColumn = 100 diff --git a/build.sbt b/build.sbt index 84c52fe6..a7f836c2 100644 --- a/build.sbt +++ b/build.sbt @@ -143,6 +143,42 @@ val common = project.in(file("modules/common")). Dependencies.pureconfig.map(_ % "optional") ) +// Some example files for testing +// https://file-examples.com/index.php/sample-documents-download/sample-doc-download/ +val files = project.in(file("modules/files")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). + settings( + name := "docspell-files", + libraryDependencies ++= + Dependencies.tika, + Test / sourceGenerators += Def.task { + val base = (Test/resourceDirectory).value + val files = (base ** (_.isFile)) pair sbt.io.Path.relativeTo(base) + val lines = files.toList.map(_._2).map(s => { + val ident = s.replaceAll("[^a-zA-Z0-9_]+", "_") + ident -> s"""val $ident = createUrl("${s}")""" + }) + val content = s"""package docspell.files + +object ExampleFiles extends ExampleFilesSupport { + +${lines.map(_._2).mkString("\n")} + +val all = List( +${lines.map(_._1).mkString(",\n")} +) + +} +""" + val target = (Test/sourceManaged).value/"scala"/"ExampleFiles.scala" + IO.createDirectory(target.getParentFile) + IO.write(target, content) + Seq(target) + }.taskValue + ).dependsOn(common) + val store = project.in(file("modules/store")). disablePlugins(RevolverPlugin). settings(sharedSettings). @@ -160,19 +196,44 @@ val store = project.in(file("modules/store")). Dependencies.emil ).dependsOn(common) -val text = project.in(file("modules/text")). +val extract = project.in(file("modules/extract")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). + settings( + name := "docspell-extract", + libraryDependencies ++= + Dependencies.fs2 ++ + Dependencies.twelvemonkeys ++ + Dependencies.pdfbox ++ + Dependencies.poi ++ + Dependencies.commonsIO ++ + Dependencies.julOverSlf4j + ).dependsOn(common, files % "compile->compile;test->test") + +val convert = project.in(file("modules/convert")). + disablePlugins(RevolverPlugin). + settings(sharedSettings). + settings(testSettings). + settings( + name := "docspell-convert", + libraryDependencies ++= + Dependencies.flexmark ++ + Dependencies.twelvemonkeys + ).dependsOn(common, files % "compile->compile;test->test") + +val analysis = project.in(file("modules/analysis")). disablePlugins(RevolverPlugin). enablePlugins(NerModelsPlugin). settings(sharedSettings). settings(testSettings). settings(NerModelsPlugin.nerClassifierSettings). settings( - name := "docspell-text", + name := "docspell-analysis", libraryDependencies ++= Dependencies.fs2 ++ - Dependencies.tika ++ Dependencies.stanfordNlpCore - ).dependsOn(common) + ).dependsOn(common, files % "test->test") val restapi = project.in(file("modules/restapi")). disablePlugins(RevolverPlugin). @@ -226,7 +287,7 @@ val joex = project.in(file("modules/joex")). addCompilerPlugin(Dependencies.betterMonadicFor), buildInfoPackage := "docspell.joex", reStart/javaOptions ++= Seq(s"-Dconfig.file=${(LocalRootProject/baseDirectory).value/"local"/"dev.conf"}") - ).dependsOn(store, text, joexapi, restapi) + ).dependsOn(store, extract, convert, analysis, joexapi, restapi) val backend = project.in(file("modules/backend")). disablePlugins(RevolverPlugin). 
@@ -303,11 +364,11 @@ val microsite = project.in(file("modules/microsite")). skip in publish := true, micrositeFooterText := Some( """
- |<p>&copy; 2019 Docspell, v{{site.version}}</p>
+ |<p>&copy; 2020 Docspell, v{{site.version}}</p>
|""".stripMargin ), micrositeName := "Docspell", - micrositeDescription := "A (PDF) Document Organizer", + micrositeDescription := "Auto-tagging Document Organizer", micrositeDocumentationUrl := "/docspell/getit.html", micrositeDocumentationLabelDescription := "Quickstart", micrositeFavicons := Seq(microsites.MicrositeFavicon("favicon.png", "96x96")), @@ -356,7 +417,10 @@ val root = project.in(file(".")). name := "docspell-root" ). aggregate(common - , text + , extract + , convert + , analysis + , files , store , joexapi , joex diff --git a/modules/text/src/main/scala/docspell/text/contact/Contact.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala similarity index 92% rename from modules/text/src/main/scala/docspell/text/contact/Contact.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala index f1e5b480..bd3c5823 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Contact.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Contact.scala @@ -1,9 +1,10 @@ -package docspell.text.contact +package docspell.analysis.contact import fs2.Stream import cats.implicits._ -import docspell.common.{Ident, LenientUri, NerLabel, NerTag} -import docspell.text.split.TextSplitter + +import docspell.common._ +import docspell.analysis.split._ object Contact { private[this] val protocols = Set("ftp", "http", "https") diff --git a/modules/text/src/main/scala/docspell/text/contact/Domain.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala similarity index 97% rename from modules/text/src/main/scala/docspell/text/contact/Domain.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala index b9973392..3fc16ebb 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Domain.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Domain.scala @@ -1,4 +1,4 @@ -package docspell.text.contact +package docspell.analysis.contact import cats.data.NonEmptyList import docspell.common.LenientUri diff --git a/modules/text/src/main/scala/docspell/text/contact/Tld.scala b/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala similarity index 93% rename from modules/text/src/main/scala/docspell/text/contact/Tld.scala rename to modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala index af7cae07..3f8ba6ae 100644 --- a/modules/text/src/main/scala/docspell/text/contact/Tld.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/contact/Tld.scala @@ -1,6 +1,6 @@ -package docspell.text.contact +package docspell.analysis.contact -private[text] object Tld { +private[analysis] object Tld { def findTld(str: String): Option[String] = known.find(str.endsWith) diff --git a/modules/text/src/main/scala/docspell/text/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala similarity index 93% rename from modules/text/src/main/scala/docspell/text/date/DateFind.scala rename to modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 79f956ec..eb21f0a8 100644 --- a/modules/text/src/main/scala/docspell/text/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -1,10 +1,10 @@ -package docspell.text.date +package docspell.analysis.date -import fs2._ import java.time.LocalDate -import docspell.common.{Language, NerDateLabel, NerLabel, NerTag} -import docspell.text.split.{TextSplitter, Word} +import fs2.{Pure, Stream} +import docspell.common._ +import 
docspell.analysis.split._ import scala.util.Try @@ -21,7 +21,7 @@ object DateFind { .map(sd => NerDateLabel( sd.toLocalDate, - NerLabel(text.substring(q(0).begin, q(2).end), NerTag.Date, q(0).begin, q(1).end) + NerLabel(text.substring(q.head.begin, q(2).end), NerTag.Date, q.head.begin, q(1).end) ) ) ) diff --git a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala similarity index 95% rename from modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala rename to modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala index 084d9dc4..e8a3329d 100644 --- a/modules/text/src/main/scala/docspell/text/nlp/StanfordNerClassifier.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/nlp/StanfordNerClassifier.scala @@ -1,17 +1,17 @@ -package docspell.text.nlp +package docspell.analysis.nlp +import java.net.URL import java.util.zip.GZIPInputStream -import docspell.common.{Language, NerLabel, NerTag} import edu.stanford.nlp.ie.AbstractSequenceClassifier import edu.stanford.nlp.ie.crf.CRFClassifier import edu.stanford.nlp.ling.{CoreAnnotations, CoreLabel} +import org.log4s.getLogger -import scala.jdk.CollectionConverters._ -import org.log4s._ +import docspell.common._ -import java.net.URL import scala.util.Using +import scala.jdk.CollectionConverters._ object StanfordNerClassifier { private[this] val logger = getLogger diff --git a/modules/text/src/main/scala/docspell/text/split/TextSplitter.scala b/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala similarity index 96% rename from modules/text/src/main/scala/docspell/text/split/TextSplitter.scala rename to modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala index 7a7cd292..d4892dac 100644 --- a/modules/text/src/main/scala/docspell/text/split/TextSplitter.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/split/TextSplitter.scala @@ -1,4 +1,4 @@ -package docspell.text.split +package docspell.analysis.split import fs2.Stream diff --git a/modules/text/src/main/scala/docspell/text/split/Word.scala b/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala similarity index 96% rename from modules/text/src/main/scala/docspell/text/split/Word.scala rename to modules/analysis/src/main/scala/docspell/analysis/split/Word.scala index 88f0c3c0..f68f0af7 100644 --- a/modules/text/src/main/scala/docspell/text/split/Word.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/split/Word.scala @@ -1,4 +1,4 @@ -package docspell.text.split +package docspell.analysis.split case class Word(value: String, begin: Int, end: Int) { def isEmpty: Boolean = value.isEmpty diff --git a/modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala similarity index 97% rename from modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala index 721d2c35..aaff9203 100644 --- a/modules/text/src/test/scala/docspell/text/contact/ContactAnnotateSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/contact/ContactAnnotateSpec.scala @@ -1,4 +1,4 @@ -package docspell.text.contact +package docspell.analysis.contact import docspell.common.{NerLabel, NerTag} import minitest.SimpleTestSuite diff --git 
a/modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala similarity index 75% rename from modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index a974fa22..5ffc853f 100644 --- a/modules/text/src/test/scala/docspell/text/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -1,8 +1,8 @@ -package docspell.text.date +package docspell.analysis.date +import docspell.files.TestFiles +import minitest.SimpleTestSuite import docspell.common.Language -import docspell.text.TestFiles -import minitest._ object DateFindSpec extends SimpleTestSuite { diff --git a/modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala similarity index 52% rename from modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala rename to modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala index 309c241b..cb932cf4 100644 --- a/modules/text/src/test/scala/docspell/text/nlp/TextAnalyserSuite.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/nlp/TextAnalyserSuite.scala @@ -1,8 +1,8 @@ -package docspell.text.nlp +package docspell.analysis.nlp -import docspell.common.{Language, NerLabel, NerTag} -import docspell.text.TestFiles import minitest.SimpleTestSuite +import docspell.files.TestFiles +import docspell.common._ object TextAnalyserSuite extends SimpleTestSuite { @@ -12,25 +12,23 @@ object TextAnalyserSuite extends SimpleTestSuite { NerLabel("Derek", NerTag.Person, 0, 5), NerLabel("Jeter", NerTag.Person, 6, 11), NerLabel("Treesville", NerTag.Person, 27, 37), - NerLabel("Derek", NerTag.Person, 69, 74), - NerLabel("Jeter", NerTag.Person, 75, 80), - NerLabel("Treesville", NerTag.Location, 96, 106), - NerLabel("M.", NerTag.Person, 142, 144), - NerLabel("Leat", NerTag.Person, 145, 149), - NerLabel("Syrup", NerTag.Organization, 160, 165), - NerLabel("Production", NerTag.Organization, 166, 176), - NerLabel("Old", NerTag.Organization, 177, 180), - NerLabel("Sticky", NerTag.Organization, 181, 187), - NerLabel("Pancake", NerTag.Organization, 188, 195), - NerLabel("Company", NerTag.Organization, 196, 203), - NerLabel("Maple", NerTag.Location, 208, 213), - NerLabel("Lane", NerTag.Location, 214, 218), - NerLabel("Forest", NerTag.Location, 220, 226), - NerLabel("Hemptown", NerTag.Location, 241, 249), - NerLabel("Little", NerTag.Organization, 349, 355), - NerLabel("League", NerTag.Organization, 356, 362), - NerLabel("Derek", NerTag.Person, 1119, 1124), - NerLabel("Jeter", NerTag.Person, 1125, 1130) + NerLabel("Derek", NerTag.Person, 68, 73), + NerLabel("Jeter", NerTag.Person, 74, 79), + NerLabel("Treesville", NerTag.Location, 95, 105), + NerLabel("Syrup", NerTag.Organization, 159, 164), + NerLabel("Production", NerTag.Organization, 165, 175), + NerLabel("Old", NerTag.Organization, 176, 179), + NerLabel("Sticky", NerTag.Organization, 180, 186), + NerLabel("Pancake", NerTag.Organization, 187, 194), + NerLabel("Company", NerTag.Organization, 195, 202), + NerLabel("Maple", NerTag.Location, 207, 212), + NerLabel("Lane", NerTag.Location, 213, 217), + NerLabel("Forest", NerTag.Location, 219, 225), + NerLabel("Hemptown", NerTag.Location, 239, 247), + NerLabel("Little", NerTag.Organization, 347, 353), + NerLabel("League", NerTag.Organization, 354, 360), + NerLabel("Derek", 
NerTag.Person, 1117, 1122), + NerLabel("Jeter", NerTag.Person, 1123, 1128) ) assertEquals(labels, expect) } diff --git a/modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala similarity index 91% rename from modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala rename to modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala index 13e91a5d..4c17741d 100644 --- a/modules/text/src/test/scala/docspell/text/split/TestSplitterSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/split/TestSplitterSpec.scala @@ -1,6 +1,6 @@ -package docspell.text.split +package docspell.analysis.split -import minitest._ +import minitest.SimpleTestSuite object TestSplitterSpec extends SimpleTestSuite { diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index aec36e4f..d9613312 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -8,11 +8,10 @@ import doobie._ import doobie.implicits._ import docspell.store.{AddResult, Store} import docspell.store.queries.{QAttachment, QItem} -import OItem.{AttachmentData, ItemData, ListItem, Query} +import OItem.{AttachmentData, AttachmentSourceData, ItemData, ListItem, Query} import bitpeace.{FileMeta, RangeDef} import docspell.common.{Direction, Ident, ItemState, MetaProposalList, Timestamp} -import docspell.store.records.{RAttachment, RAttachmentMeta, RItem, RTagItem} -import docspell.store.records.RSource +import docspell.store.records.{RAttachment, RAttachmentMeta, RAttachmentSource, RItem, RSource, RTagItem} trait OItem[F[_]] { @@ -22,6 +21,8 @@ trait OItem[F[_]] { def findAttachment(id: Ident, collective: Ident): F[Option[AttachmentData[F]]] + def findAttachmentSource(id: Ident, collective: Ident): F[Option[AttachmentSourceData[F]]] + def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] def setDirection(item: Ident, direction: Direction, collective: Ident): F[AddResult] @@ -67,7 +68,23 @@ object OItem { type ItemData = QItem.ItemData val ItemData = QItem.ItemData + trait BinaryData[F[_]] { + def data: Stream[F, Byte] + def name: Option[String] + def meta: FileMeta + def fileId: Ident + } case class AttachmentData[F[_]](ra: RAttachment, meta: FileMeta, data: Stream[F, Byte]) + extends BinaryData[F] { + val name = ra.name + val fileId = ra.fileId + } + + case class AttachmentSourceData[F[_]](rs: RAttachmentSource, meta: FileMeta, data: Stream[F, Byte]) + extends BinaryData[F] { + val name = rs.name + val fileId = rs.fileId + } def apply[F[_]: Effect](store: Store[F]): Resource[F, OItem[F]] = Resource.pure[F, OItem[F]](new OItem[F] { @@ -83,24 +100,41 @@ object OItem { .transact(RAttachment.findByIdAndCollective(id, collective)) .flatMap({ case Some(ra) => - store.bitpeace - .get(ra.fileId.id) - .unNoneTerminate - .compile - .last - .map( - _.map(m => - AttachmentData[F]( - ra, - m, - store.bitpeace.fetchData2(RangeDef.all)(Stream.emit(m)) - ) - ) + makeBinaryData(ra.fileId) { m => + AttachmentData[F]( + ra, + m, + store.bitpeace.fetchData2(RangeDef.all)(Stream.emit(m)) ) + } + case None => (None: Option[AttachmentData[F]]).pure[F] }) + def findAttachmentSource(id: Ident, collective: Ident): F[Option[AttachmentSourceData[F]]] = + store + .transact(RAttachmentSource.findByIdAndCollective(id, collective)) + .flatMap({ 
+ case Some(ra) => + makeBinaryData(ra.fileId) { m => + AttachmentSourceData[F]( + ra, + m, + store.bitpeace.fetchData2(RangeDef.all)(Stream.emit(m)) + ) + } + + case None => + (None: Option[AttachmentSourceData[F]]).pure[F] + }) + + private def makeBinaryData[A](fileId: Ident)(f: FileMeta => A): F[Option[A]] = + store.bitpeace + .get(fileId.id).unNoneTerminate.compile.last.map( + _.map(m => f(m)) + ) + def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[AddResult] = { val db = for { cid <- RItem.getCollective(item) diff --git a/modules/common/src/main/scala/docspell/common/DataType.scala b/modules/common/src/main/scala/docspell/common/DataType.scala new file mode 100644 index 00000000..69d2a883 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/DataType.scala @@ -0,0 +1,19 @@ +package docspell.common + +sealed trait DataType { + +} + +object DataType { + + case class Exact(mime: MimeType) extends DataType + + case class Hint(hint: MimeTypeHint) extends DataType + + + def apply(mt: MimeType): DataType = + Exact(mt) + + def filename(name: String): DataType = + Hint(MimeTypeHint.filename(name)) +} diff --git a/modules/text/src/main/scala/docspell/text/ocr/File.scala b/modules/common/src/main/scala/docspell/common/File.scala similarity index 64% rename from modules/text/src/main/scala/docspell/text/ocr/File.scala rename to modules/common/src/main/scala/docspell/common/File.scala index 31d05d3b..f85845c7 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/File.scala +++ b/modules/common/src/main/scala/docspell/common/File.scala @@ -1,14 +1,14 @@ -package docspell.text.ocr +package docspell.common -import cats.implicits._ -import scala.jdk.CollectionConverters._ import java.io.IOException import java.nio.file.attribute.BasicFileAttributes import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor} import java.util.concurrent.atomic.AtomicInteger -import cats.effect.Sync +import scala.jdk.CollectionConverters._ import fs2.Stream +import cats.implicits._ +import cats.effect._ object File { @@ -18,6 +18,9 @@ object File { def mkTempDir[F[_]: Sync](parent: Path, prefix: String): F[Path] = mkDir(parent).map(p => Files.createTempDirectory(p, prefix)) + def mkTempFile[F[_]: Sync](parent: Path, prefix: String, suffix: Option[String] = None): F[Path] = + mkDir(parent).map(p => Files.createTempFile(p, prefix, suffix.orNull)) + def deleteDirectory[F[_]: Sync](dir: Path): F[Int] = Sync[F].delay { val count = new AtomicInteger(0) Files.walkFileTree( @@ -40,6 +43,12 @@ object File { count.get } + def exists[F[_]: Sync](file: Path): F[Boolean] = + Sync[F].delay(Files.exists(file)) + + def existsNonEmpty[F[_]: Sync](file: Path, minSize: Long = 0): F[Boolean] = + Sync[F].delay(Files.exists(file) && Files.size(file) > minSize) + def deleteFile[F[_]: Sync](file: Path): F[Unit] = Sync[F].delay(Files.deleteIfExists(file)).map(_ => ()) @@ -47,10 +56,8 @@ object File { if (Files.isDirectory(path)) deleteDirectory(path) else deleteFile(path).map(_ => 1) - def withTempDir[F[_]: Sync, A](parent: Path, prefix: String)( - f: Path => Stream[F, A] - ): Stream[F, A] = - Stream.bracket(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())).flatMap(f) + def withTempDir[F[_]: Sync](parent: Path, prefix: String): Resource[F, Path] = + Resource.make(mkTempDir(parent, prefix))(p => delete(p).map(_ => ())) def listFiles[F[_]: Sync](pred: Path => Boolean, dir: Path): F[List[Path]] = Sync[F].delay { val javaList = @@ -58,4 +65,11 @@ object File { 
javaList.asScala.toList.sortBy(_.getFileName.toString) } + def readAll[F[_]: Sync: ContextShift](file: Path, blocker: Blocker, chunkSize: Int): Stream[F, Byte] = + fs2.io.file.readAll(file, blocker, chunkSize) + + def readText[F[_]: Sync: ContextShift](file: Path, blocker: Blocker): F[String] = + readAll[F](file, blocker, 8192). + through(fs2.text.utf8Decode). + compile.foldMonoid } diff --git a/modules/common/src/main/scala/docspell/common/LenientUri.scala b/modules/common/src/main/scala/docspell/common/LenientUri.scala index 03632818..8e9959d7 100644 --- a/modules/common/src/main/scala/docspell/common/LenientUri.scala +++ b/modules/common/src/main/scala/docspell/common/LenientUri.scala @@ -65,6 +65,11 @@ case class LenientUri( fs2.io.readInputStream(Sync[F].delay(url.openStream()), chunkSize, blocker, true) ) + def readText[F[_]: Sync: ContextShift](chunkSize: Int, blocker: Blocker): F[String] = + readURL[F](chunkSize, blocker). + through(fs2.text.utf8Decode). + compile.foldMonoid + def host: Option[String] = authority.map(a => a.indexOf(':') match { diff --git a/modules/common/src/main/scala/docspell/common/Logger.scala b/modules/common/src/main/scala/docspell/common/Logger.scala new file mode 100644 index 00000000..e95264b0 --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/Logger.scala @@ -0,0 +1,41 @@ +package docspell.common + +import cats.effect.Sync +import docspell.common.syntax.all._ +import org.log4s.{Logger => Log4sLogger} + +trait Logger[F[_]] { + + def trace(msg: => String): F[Unit] + def debug(msg: => String): F[Unit] + def info(msg: => String): F[Unit] + def warn(msg: => String): F[Unit] + def error(ex: Throwable)(msg: => String): F[Unit] + def error(msg: => String): F[Unit] + +} + +object Logger { + + + def log4s[F[_]: Sync](log: Log4sLogger): Logger[F] = new Logger[F] { + def trace(msg: => String): F[Unit] = + log.ftrace(msg) + + def debug(msg: => String): F[Unit] = + log.fdebug(msg) + + def info(msg: => String): F[Unit] = + log.finfo(msg) + + def warn(msg: => String): F[Unit] = + log.fwarn(msg) + + def error(ex: Throwable)(msg: => String): F[Unit] = + log.ferror(ex)(msg) + + def error(msg: => String): F[Unit] = + log.ferror(msg) + } + +} \ No newline at end of file diff --git a/modules/common/src/main/scala/docspell/common/MimeType.scala b/modules/common/src/main/scala/docspell/common/MimeType.scala index 7e6e6647..bffbb667 100644 --- a/modules/common/src/main/scala/docspell/common/MimeType.scala +++ b/modules/common/src/main/scala/docspell/common/MimeType.scala @@ -27,7 +27,7 @@ object MimeType { MimeType("image", partFromString(sub).throwLeft) private[this] val validChars: Set[Char] = - (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-").toSet + (('A' to 'Z') ++ ('a' to 'z') ++ ('0' to '9') ++ "*-.+").toSet def parse(str: String): Either[String, MimeType] = str.indexOf('/') match { @@ -44,10 +44,11 @@ object MimeType { private def partFromString(s: String): Either[String, String] = if (s.forall(validChars.contains)) Right(s) - else Left(s"Invalid identifier: $s. Allowed chars: ${validChars.mkString}") + else Left(s"Invalid identifier: $s. 
Allowed chars: ${validChars.toList.sorted.mkString}") val octetStream = application("octet-stream") val pdf = application("pdf") + val zip = application("zip") val png = image("png") val jpeg = image("jpeg") val tiff = image("tiff") diff --git a/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala new file mode 100644 index 00000000..4199a29f --- /dev/null +++ b/modules/common/src/main/scala/docspell/common/MimeTypeHint.scala @@ -0,0 +1,20 @@ +package docspell.common + +case class MimeTypeHint(filename: Option[String], advertised: Option[String]) { + + def withName(name: String): MimeTypeHint = + copy(filename = Some(name)) +} + +object MimeTypeHint { + val none = MimeTypeHint(None, None) + + def filename(name: String): MimeTypeHint = + MimeTypeHint(Some(name), None) + + def advertised(mimeType: MimeType): MimeTypeHint = + advertised(mimeType.asString) + + def advertised(mimeType: String): MimeTypeHint = + MimeTypeHint(None, Some(mimeType)) +} diff --git a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala b/modules/common/src/main/scala/docspell/common/SystemCommand.scala similarity index 65% rename from modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala rename to modules/common/src/main/scala/docspell/common/SystemCommand.scala index f433c967..075c2dc7 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/SystemCommand.scala +++ b/modules/common/src/main/scala/docspell/common/SystemCommand.scala @@ -1,35 +1,54 @@ -package docspell.text.ocr +package docspell.common import java.io.InputStream +import java.lang.ProcessBuilder.Redirect import java.nio.file.Path import java.util.concurrent.TimeUnit + import cats.implicits._ import cats.effect.{Blocker, ContextShift, Sync} import fs2.{Stream, io, text} -import org.log4s.getLogger + import scala.jdk.CollectionConverters._ -import docspell.common.syntax.all._ object SystemCommand { - private[this] val logger = getLogger + final case class Config(program: String, args: Seq[String], timeout: Duration) { + + def mapArgs(f: String => String): Config = + Config(program, args.map(f), timeout) + + def replace(repl: Map[String, String]): Config = + mapArgs(s => + repl.foldLeft(s) { + case (res, (k, v)) => + res.replace(k, v) + }) + + def toCmd: List[String] = + program :: args.toList + + lazy val cmdString: String = + toCmd.mkString(" ") + } final case class Result(rc: Int, stdout: String, stderr: String) def exec[F[_]: Sync: ContextShift]( - cmd: Config.Command, + cmd: Config, blocker: Blocker, + logger: Logger[F], wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty ): Stream[F, Result] = - startProcess(cmd, wd) { proc => + startProcess(cmd, wd, logger, stdin) { proc => Stream.eval { for { _ <- writeToProcess(stdin, proc, blocker) term <- Sync[F].delay(proc.waitFor(cmd.timeout.seconds, TimeUnit.SECONDS)) - _ <- if (term) logger.fdebug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") + _ <- if (term) logger.debug(s"Command `${cmd.cmdString}` finished: ${proc.exitValue}") else - logger.fwarn( + logger.warn( s"Command `${cmd.cmdString}` did not finish in ${cmd.timeout.formatExact}!" 
) _ <- if (!term) timeoutError(proc, cmd) else Sync[F].pure(()) @@ -40,12 +59,13 @@ object SystemCommand { } def execSuccess[F[_]: Sync: ContextShift]( - cmd: Config.Command, + cmd: Config, blocker: Blocker, + logger: Logger[F], wd: Option[Path] = None, stdin: Stream[F, Byte] = Stream.empty ): Stream[F, Result] = - exec(cmd, blocker, wd, stdin).flatMap { r => + exec(cmd, blocker, logger, wd, stdin).flatMap { r => if (r.rc != 0) Stream.raiseError[F]( new Exception( @@ -55,18 +75,23 @@ object SystemCommand { else Stream.emit(r) } - private def startProcess[F[_]: Sync, A](cmd: Config.Command, wd: Option[Path])( + private def startProcess[F[_]: Sync, A](cmd: Config, wd: Option[Path], logger: Logger[F], stdin: Stream[F, Byte])( f: Process => Stream[F, A] ): Stream[F, A] = { - val log = logger.fdebug(s"Running external command: ${cmd.cmdString}") - val proc = log *> Sync[F].delay { + val log = logger.debug(s"Running external command: ${cmd.cmdString}") + val hasStdin = stdin.take(1).compile.last.map(_.isDefined) + val proc = log *> hasStdin.flatMap(flag => Sync[F].delay { val pb = new ProcessBuilder(cmd.toCmd.asJava) + .redirectInput(if (flag) Redirect.PIPE else Redirect.INHERIT) + .redirectError(Redirect.PIPE) + .redirectOutput(Redirect.PIPE) + wd.map(_.toFile).foreach(pb.directory) pb.start() - } + }) Stream .bracket(proc)(p => - logger.fdebug(s"Closing process: `${cmd.cmdString}`").map { _ => + logger.debug(s"Closing process: `${cmd.cmdString}`").map { _ => p.destroy() } ) @@ -93,7 +118,7 @@ object SystemCommand { ): F[Unit] = data.through(io.writeOutputStream(Sync[F].delay(proc.getOutputStream), blocker)).compile.drain - private def timeoutError[F[_]: Sync](proc: Process, cmd: Config.Command): F[Unit] = + private def timeoutError[F[_]: Sync](proc: Process, cmd: Config): F[Unit] = Sync[F].delay(proc.destroyForcibly()).attempt *> { Sync[F].raiseError( new Exception(s"Command `${cmd.cmdString}` timed out (${cmd.timeout.formatExact})") diff --git a/modules/convert/src/main/scala/docspell/convert/Conversion.scala b/modules/convert/src/main/scala/docspell/convert/Conversion.scala new file mode 100644 index 00000000..18d62517 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/Conversion.scala @@ -0,0 +1,121 @@ +package docspell.convert + +import java.nio.charset.StandardCharsets + +import fs2._ +import cats.effect._ +import cats.implicits._ +import docspell.common._ +import docspell.convert.ConversionResult.Handler +import docspell.convert.extern.{Tesseract, Unoconv, WkHtmlPdf} +import docspell.convert.flexmark.Markdown +import docspell.files.{ImageSize, TikaMimetype} + +trait Conversion[F[_]] { + + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] + +} + +object Conversion { + + def create[F[_]: Sync: ContextShift]( + cfg: ConvertConfig, + blocker: Blocker, + logger: Logger[F] + ): Resource[F, Conversion[F]] = + Resource.pure(new Conversion[F] { + + def toPDF[A](dataType: DataType, lang: Language, handler: Handler[F, A])(in: Stream[F, Byte]): F[A] = + TikaMimetype.resolve(dataType, in).flatMap { + case MimeType.pdf => + handler.run(ConversionResult.successPdf(in)) + + case MimeType.html => + WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(in, handler) + + case Texts(_) => + Markdown.toHtml(in, cfg.markdown).flatMap { html => + val bytes = Stream + .chunk(Chunk.bytes(html.getBytes(StandardCharsets.UTF_8))) + .covary[F] + WkHtmlPdf.toPDF(cfg.wkhtmlpdf, cfg.chunkSize, blocker, logger)(bytes, handler) + } + + case 
Images(mt) => + ImageSize.get(in).flatMap { + case Some(dim) => + if (dim.product > cfg.maxImageSize) { + logger + .info(s"Image size (${dim.product}) is too large (max ${cfg.maxImageSize}).") *> + handler.run( + ConversionResult.inputMalformed( + mt, + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.maxImageSize})." + ) + ) + } else { + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + } + + case None => + logger.info( + s"Cannot read image when determining size for ${mt.asString}. Converting anyways." + ) *> + Tesseract.toPDF(cfg.tesseract, lang, cfg.chunkSize, blocker, logger)(in, handler) + } + + case Office(_) => + Unoconv.toPDF(cfg.unoconv, cfg.chunkSize, blocker, logger)(in, handler) + + case mt => + handler.run(ConversionResult.unsupportedFormat(mt)) + } + }) + + object Images { + + val all = Set(MimeType.jpeg, MimeType.png, MimeType.tiff) + + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(all.contains) + } + + object Texts { + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(_.primary == "text") + } + + object Office { + val odt = MimeType.application("vnd.oasis.opendocument.text") + val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") + val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text") + val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") + val msoffice = MimeType.application("x-tika-msoffice") + val ooxml = MimeType.application("x-tika-ooxml") + val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") + val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") + val xls = MimeType.application("vnd.ms-excel") + val doc = MimeType.application("msword") + val rtf = MimeType.application("rtf") + + // without a filename, tika returns application/zip for odt/ods files, since + // they are just zip files + val odfContainer = MimeType.zip + + val all = + Set(odt, ods, odtAlias, odsAlias, msoffice, ooxml, docx, xlsx, xls, doc, rtf, odfContainer) + + def unapply(m: MimeType): Option[MimeType] = + Some(m).filter(all.contains) + } + + def unapply(mt: MimeType): Option[MimeType] = + mt match { + case Office(_) => Some(mt) + case Texts(_) => Some(mt) + case Images(_) => Some(mt) + case MimeType.html => Some(mt) + case _ => None + } +} diff --git a/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala b/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala new file mode 100644 index 00000000..dee9e9e0 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/ConversionResult.scala @@ -0,0 +1,53 @@ +package docspell.convert + +import cats.data.Kleisli +import fs2.Stream +import docspell.common.MimeType + +sealed trait ConversionResult[F[_]] { + + def pdfData: Stream[F, Byte] + +} + +object ConversionResult { + + /** The conversion is done by external tools that write files to the + * file system. These are temporary files and they will be deleted + * once the process finishes. This handler is used to do something + * relevant with the resulting files. 
+ */ + type Handler[F[_], A] = Kleisli[F, ConversionResult[F], A] + + def unsupportedFormat[F[_]](mime: MimeType): ConversionResult[F] = + UnsupportedFormat[F](mime) + + def failure[F[_]](ex: Throwable): ConversionResult[F] = + Failure[F](ex) + + def successPdf[F[_]](pdf: Stream[F, Byte]): ConversionResult[F] = + SuccessPdf[F](pdf) + + def successPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]): ConversionResult[F] = + SuccessPdfTxt[F](pdf, txt) + + def inputMalformed[F[_]](mimeType: MimeType, reason: String): ConversionResult[F] = + InputMalformed(mimeType, reason) + + case class UnsupportedFormat[F[_]](mime: MimeType) extends ConversionResult[F] { + val pdfData = Stream.empty + } + case class Failure[F[_]](ex: Throwable) extends ConversionResult[F] { + val pdfData = Stream.empty + } + case class SuccessPdf[F[_]](pdf: Stream[F, Byte]) extends ConversionResult[F] { + val pdfData = pdf + } + case class SuccessPdfTxt[F[_]](pdf: Stream[F, Byte], txt: F[String]) extends ConversionResult[F] { + val pdfData = pdf + } + + case class InputMalformed[F[_]](mimeType: MimeType, reason: String) extends ConversionResult[F] { + val pdfData = Stream.empty + } +} diff --git a/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala new file mode 100644 index 00000000..887fe218 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/ConvertConfig.scala @@ -0,0 +1,11 @@ +package docspell.convert + +import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} +import docspell.convert.flexmark.MarkdownConfig + +case class ConvertConfig(chunkSize: Int, + maxImageSize: Int, + markdown: MarkdownConfig, + wkhtmlpdf: WkHtmlPdfConfig, + tesseract: TesseractConfig, + unoconv: UnoconvConfig) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala new file mode 100644 index 00000000..bf682287 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/ExternConv.scala @@ -0,0 +1,120 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.implicits._ +import cats.effect._ +import fs2.{Pipe, Stream} +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.{Handler, successPdf, successPdfTxt} + +private[extern] object ExternConv { + + def toPDF[F[_]: Sync: ContextShift, A]( + name: String, + cmdCfg: SystemCommand.Config, + wd: Path, + useStdin: Boolean, + blocker: Blocker, + logger: Logger[F], + reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = + Stream.resource(File.withTempDir[F](wd, s"docspell-$name")).flatMap { dir => + val inFile = dir.resolve("infile").toAbsolutePath.normalize + val out = dir.resolve("out.pdf").toAbsolutePath.normalize + val sysCfg = + cmdCfg.replace( + Map( + "{{outfile}}" -> out.toString + ) ++ + (if (!useStdin) Map("{{infile}}" -> inFile.toString) + else Map.empty) + ) + + val createInput: Pipe[F, Byte, Unit] = + if (useStdin) _ => Stream.emit(()) + else storeDataToFile(name, blocker, logger, inFile) + + in.through(createInput).flatMap { _ => + SystemCommand + .execSuccess[F](sysCfg, blocker, logger, Some(dir), if (useStdin) in else Stream.empty) + .evalMap(result => + logResult(name, result, logger). + flatMap(_ => reader(out, result)). 
+ flatMap(handler.run) + ) + } + }.compile.lastOrError + + def readResult[F[_]: Sync: ContextShift]( + blocker: Blocker, + chunkSize: Int, + logger: Logger[F] + )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = + File.existsNonEmpty[F](out).flatMap { + case true => + if (result.rc == 0) successPdf(File.readAll(out, blocker, chunkSize)).pure[F] + else + logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + successPdf(File.readAll(out, blocker, chunkSize)).pure[F] + + case false => + ConversionResult.failure[F]( + new Exception(s"Command result=${result.rc}. No output file found.") + ).pure[F] + } + + def readResultTesseract[F[_]: Sync: ContextShift]( + outPrefix: String, + blocker: Blocker, + chunkSize: Int, + logger: Logger[F] + )(out: Path, result: SystemCommand.Result): F[ConversionResult[F]] = { + val outPdf = out.resolveSibling(s"$outPrefix.pdf") + File.existsNonEmpty[F](outPdf).flatMap { + case true => + val outTxt = out.resolveSibling(s"$outPrefix.txt") + File.exists(outTxt).flatMap(txtExists => { + val pdfData = File.readAll(out, blocker, chunkSize) + if (result.rc == 0) { + if (txtExists) successPdfTxt(pdfData, File.readText(outTxt, blocker)).pure[F] + else successPdf(pdfData).pure[F] + } else { + logger.warn(s"Command not successful (rc=${result.rc}), but file exists.") *> + successPdf(pdfData).pure[F] + } + }) + + case false => + ConversionResult.failure[F]( + new Exception(s"Command result=${result.rc}. No output file found.") + ).pure[F] + } + } + + private def storeDataToFile[F[_]: Sync: ContextShift]( + name: String, + blocker: Blocker, + logger: Logger[F], + inFile: Path + ): Pipe[F, Byte, Unit] = + in => + Stream.eval(logger.debug(s"Storing input to file ${inFile} for running $name")).drain ++ + Stream.eval(storeFile(in, inFile, blocker)) + + private def logResult[F[_]: Sync]( + name: String, + result: SystemCommand.Result, + logger: Logger[F] + ): F[Unit] = + logger.debug(s"$name stdout: ${result.stdout}") *> + logger.debug(s"$name stderr: ${result.stderr}") + + private def storeFile[F[_]: Sync: ContextShift]( + in: Stream[F, Byte], + target: Path, + blocker: Blocker + ): F[Unit] = + in.through(fs2.io.file.writeAll(target, blocker)).compile.drain +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala new file mode 100644 index 00000000..1a6b966d --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/Tesseract.scala @@ -0,0 +1,27 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.effect._ +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler + +object Tesseract { + + def toPDF[F[_]: Sync: ContextShift, A]( + cfg: TesseractConfig, + lang: Language, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val outBase = cfg.command.args.tail.headOption.getOrElse("out") + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResultTesseract[F](outBase, blocker, chunkSize, logger) + + ExternConv.toPDF[F, A]("tesseract", cfg.command.replace(Map("{{lang}}" -> lang.iso3)), cfg.workingDir, false, blocker, logger, reader)(in, handler) + } + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala new 
file mode 100644 index 00000000..51f25c23 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/TesseractConfig.scala @@ -0,0 +1,7 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class TesseractConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala new file mode 100644 index 00000000..0f362428 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/Unoconv.scala @@ -0,0 +1,25 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.effect._ +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler + +object Unoconv { + + def toPDF[F[_]: Sync: ContextShift, A]( + cfg: UnoconvConfig, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F] + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](blocker, chunkSize, logger) + + ExternConv.toPDF[F, A]("unoconv", cfg.command, cfg.workingDir, false, blocker, logger, reader)(in, handler) + } + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala new file mode 100644 index 00000000..70fd7975 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/UnoconvConfig.scala @@ -0,0 +1,7 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class UnoconvConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala new file mode 100644 index 00000000..7b70a78f --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdf.scala @@ -0,0 +1,25 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import cats.effect._ +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult +import docspell.convert.ConversionResult.Handler + +object WkHtmlPdf { + + def toPDF[F[_]: Sync: ContextShift, A]( + cfg: WkHtmlPdfConfig, + chunkSize: Int, + blocker: Blocker, + logger: Logger[F], + )(in: Stream[F, Byte], handler: Handler[F, A]): F[A] = { + val reader: (Path, SystemCommand.Result) => F[ConversionResult[F]] = + ExternConv.readResult[F](blocker, chunkSize, logger) + + ExternConv.toPDF[F, A]("wkhtmltopdf", cfg.command, cfg.workingDir, true, blocker, logger, reader)(in, handler) + } + +} diff --git a/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala new file mode 100644 index 00000000..3be06951 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/extern/WkHtmlPdfConfig.scala @@ -0,0 +1,7 @@ +package docspell.convert.extern + +import java.nio.file.Path + +import docspell.common.SystemCommand + +case class WkHtmlPdfConfig (command: SystemCommand.Config, workingDir: Path) diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala new file mode 100644 index 00000000..f895e44f --- /dev/null +++ 
b/modules/convert/src/main/scala/docspell/convert/flexmark/Markdown.scala @@ -0,0 +1,71 @@ +package docspell.convert.flexmark + +import java.io.{InputStream, InputStreamReader} +import java.nio.charset.StandardCharsets +import java.util + +import cats.effect.Sync +import cats.implicits._ +import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension +import com.vladsch.flexmark.ext.tables.TablesExtension +import com.vladsch.flexmark.html.HtmlRenderer +import com.vladsch.flexmark.parser.Parser +import com.vladsch.flexmark.util.data.{DataKey, MutableDataSet} +import fs2.Stream + +import scala.util.Try + +object Markdown { + + def toHtml(is: InputStream, cfg: MarkdownConfig): Either[Throwable, String] = { + val p = createParser() + val r = createRenderer() + Try { + val reader = new InputStreamReader(is, StandardCharsets.UTF_8) + val doc = p.parseReader(reader) + wrapHtml(r.render(doc), cfg) + }.toEither + } + + + def toHtml(md: String, cfg: MarkdownConfig): String = { + val p = createParser() + val r = createRenderer() + val doc = p.parse(md) + wrapHtml(r.render(doc), cfg) + } + + def toHtml[F[_]: Sync](data: Stream[F, Byte], cfg: MarkdownConfig): F[String] = + data.through(fs2.text.utf8Decode).compile.foldMonoid. + map(str => toHtml(str, cfg)) + + private def wrapHtml(body: String, cfg: MarkdownConfig): String = { + s"""<html> + |<head> + |<style> + |${cfg.internalCss} + |</style> + |</head> + |<body> + |$body + |</body> + |</html> + |""".stripMargin + } + + private def createParser(): Parser = { + val opts = new MutableDataSet() + opts.set(Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]], + util.Arrays.asList(TablesExtension.create(), + StrikethroughExtension.create())); + + Parser.builder(opts).build() + } + + private def createRenderer(): HtmlRenderer = { + val opts = new MutableDataSet() + HtmlRenderer.builder(opts).build() + } +} diff --git a/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala b/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala new file mode 100644 index 00000000..3d0a5ab3 --- /dev/null +++ b/modules/convert/src/main/scala/docspell/convert/flexmark/MarkdownConfig.scala @@ -0,0 +1,3 @@ +package docspell.convert.flexmark + +case class MarkdownConfig(internalCss: String) diff --git a/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala new file mode 100644 index 00000000..3c6eebc5 --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/ConversionTest.scala @@ -0,0 +1,160 @@ +package docspell.convert + +import java.nio.file.Paths + +import cats.data.Kleisli +import cats.implicits._ +import cats.effect.IO +import fs2.Stream +import docspell.common._ +import docspell.convert.ConversionResult.Handler +import docspell.convert.extern.{TesseractConfig, UnoconvConfig, WkHtmlPdfConfig} +import docspell.convert.flexmark.MarkdownConfig +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object ConversionTest extends SimpleTestSuite with FileChecks { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val logger = Logger.log4s[IO](org.log4s.getLogger) + val target = Paths.get("target") + + val convertConfig = ConvertConfig( + 8192, + 3000 * 3000, + MarkdownConfig("body { padding: 2em 5em; }"), + WkHtmlPdfConfig( + SystemCommand.Config( + "wkhtmltopdf", + Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), + Duration.seconds(20) + ), + target + ), + TesseractConfig( + SystemCommand.Config( + "tesseract", + Seq("{{infile}}", 
"out", "-l", "deu", "pdf", "txt"), + Duration.seconds(20) + ), + target + ), + UnoconvConfig( + SystemCommand.Config( + "unoconv", + Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), + Duration.seconds(20) + ), + target + ) + ) + + val conversion = Conversion.create[IO](convertConfig, blocker, logger) + + val bombs = List( + ExampleFiles.bombs_20K_gray_jpeg, + ExampleFiles.bombs_20K_gray_png, + ExampleFiles.bombs_20K_rgb_jpeg, + ExampleFiles.bombs_20K_rgb_png + ) + val pdfOnly = List( + ExampleFiles.examples_sample_ods, + ExampleFiles.examples_sample_doc, + ExampleFiles.examples_sample_docx, + ExampleFiles.examples_sample_ods, + ExampleFiles.examples_sample_odt, + ExampleFiles.examples_sample_rtf, + ExampleFiles.examples_sample_xls, + ExampleFiles.examples_sample_xlsx, + ExampleFiles.letter_de_md, + ExampleFiles.letter_de_txt, + ExampleFiles.letter_en_txt, + ExampleFiles.letter_de_html + ) + val pdfAndTxt = List( + ExampleFiles.camera_letter_en_jpg, + ExampleFiles.camera_letter_en_png, + ExampleFiles.camera_letter_en_tiff, + ExampleFiles.scanner_jfif_jpg + ) + + test("convert to pdf") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + File + .withTempDir[IO](target, "convpdf") + .use { dir => + conversion.use { conv => + def check(n: Long): Handler[IO, Unit] = + storePdfHandler(dir.resolve(s"test-$n.pdf")).map { p => + assert(p.isNonEmpty && p.isPDF) + } + + runConversion(pdfOnly, check, conv).compile.drain + } + } + .unsafeRunSync() + } + + test("convert image to pdf and txt") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + File + .withTempDir[IO](target, "convimgpdf") + .use { dir => + conversion.use { conv => + def check(n: Long): Handler[IO, Unit] = + storePdfTxtHandler(dir.resolve(s"test-$n.pdf"), dir.resolve(s"test-$n.txt")) + .map { + case (p, t) => + assert(p.isNonEmpty && p.isPDF) + assert(t.isNonEmpty && t.isPlainText) + } + + runConversion(pdfAndTxt, check, conv).compile.drain + } + } + .unsafeRunSync() + } + + test("do not convert image bombs") { + if (!commandsExist) ignore("At least one of the conversion programs not found") + else + conversion + .use { conv => + def check: Handler[IO, Unit] = + Kleisli({ + case ConversionResult.InputMalformed(_, _) => + ().pure[IO] + case cr => + IO.raiseError(new Exception(s"Unexpected result: $cr")) + }) + + runConversion(bombs, _ => check, conv).compile.drain + } + .unsafeRunSync() + } + + def runConversion[A]( + uris: List[LenientUri], + handler: Long => Handler[IO, A], + conv: Conversion[IO] + ) = + Stream + .emits(uris) + .covary[IO] + .zipWithIndex + .evalMap({ + case (uri, index) => + val load = uri.readURL[IO](8192, blocker) + val dataType = DataType.filename(uri.path.segments.last) + logger.info(s"Processing file ${uri.path.asString}") *> + conv.toPDF(dataType, Language.German, handler(index))(load) + }) + + def commandsExist: Boolean = + commandExists(convertConfig.unoconv.command.program) && + commandExists(convertConfig.wkhtmlpdf.command.program) && + commandExists(convertConfig.tesseract.command.program) +} diff --git a/modules/convert/src/test/scala/docspell/convert/FileChecks.scala b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala new file mode 100644 index 00000000..52254fbb --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/FileChecks.scala @@ -0,0 +1,59 @@ +package docspell.convert + +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Path} + +import cats.data.Kleisli +import 
cats.effect.IO +import fs2.{Pipe, Stream} +import docspell.common.MimeType +import docspell.convert.ConversionResult.Handler +import docspell.files.TikaMimetype + +trait FileChecks { + + implicit class FileCheckOps(p: Path) { + + def isNonEmpty: Boolean = + Files.exists(p) && Files.size(p) > 0 + + def isType(mime: MimeType): Boolean = + TikaMimetype.detect[IO](p).map(_ == mime).unsafeRunSync + + def isPDF: Boolean = + isType(MimeType.pdf) + + def isPlainText: Boolean = + isType(MimeType.text("plain")) + } + + def storeFile(file: Path): Pipe[IO, Byte, Path] = + in => Stream.eval(in.compile.to(Array).flatMap(bytes => IO(Files.write(file, bytes)))) + + def storePdfHandler(file: Path): Handler[IO, Path] = + storePdfTxtHandler(file, file.resolveSibling("unexpected.txt")).map(_._1) + + def storePdfTxtHandler(filePdf: Path, fileTxt: Path): Handler[IO, (Path, Path)] = + Kleisli({ + case ConversionResult.SuccessPdfTxt(pdf, txt) => + for { + pout <- pdf.through(storeFile(filePdf)).compile.lastOrError + str <- txt + tout <- IO(Files.write(fileTxt, str.getBytes(StandardCharsets.UTF_8))) + } yield (pout, tout) + + case ConversionResult.SuccessPdf(pdf) => + pdf.through(storeFile(filePdf)).compile.lastOrError.map(p => (p, fileTxt)) + + case ConversionResult.Failure(ex) => + throw new Exception(s"Unexpected result (failure: ${ex.getMessage})", ex) + + case cr => + throw new Exception(s"Unexpected result: $cr") + }) + + def commandExists(cmd: String): Boolean = + Runtime.getRuntime.exec(Array("which", cmd)).waitFor() == 0 + + +} diff --git a/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala new file mode 100644 index 00000000..a2f496ec --- /dev/null +++ b/modules/convert/src/test/scala/docspell/convert/extern/ExternConvTest.scala @@ -0,0 +1,107 @@ +package docspell.convert.extern + +import java.nio.file.{Path, Paths} + +import cats.effect._ +import docspell.common._ +import docspell.convert.FileChecks +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object ExternConvTest extends SimpleTestSuite with FileChecks { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val logger = Logger.log4s[IO](org.log4s.getLogger) + val target = Paths.get("target") + + test("convert html to pdf") { + val cfg = SystemCommand.Config( + "wkhtmltopdf", + Seq("-s", "A4", "--encoding", "UTF-8", "-", "{{outfile}}"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "wkhtmltopdf") + .use(dir => + IO { + val wkCfg = WkHtmlPdfConfig(cfg, target) + val p = + WkHtmlPdf + .toPDF[IO, Path](wkCfg, 8192, blocker, logger)( + ExampleFiles.letter_de_html.readURL[IO](8192, blocker), + storePdfHandler(dir.resolve("test.pdf")) + ) + .unsafeRunSync() + + assert(p.isNonEmpty && p.isPDF) + } + ) + .unsafeRunSync + } + } + + test("convert office to pdf") { + val cfg = SystemCommand.Config( + "unoconv", + Seq("-f", "pdf", "-o", "{{outfile}}", "{{infile}}"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "unoconv") + .use(dir => + IO { + val ucCfg = UnoconvConfig(cfg, target) + val p = + Unoconv + .toPDF[IO, Path](ucCfg, 8192, blocker, logger)( + ExampleFiles.examples_sample_docx.readURL[IO](8192, blocker), + storePdfHandler(dir.resolve("test.pdf")) + ) + .unsafeRunSync() + + 
assert(p.isNonEmpty && p.isPDF) + } + ) + .unsafeRunSync + } + } + + test("convert image to pdf") { + val cfg = SystemCommand.Config( + "tesseract", + Seq("{{infile}}", "out", "-l", "deu", "pdf", "txt"), + Duration.seconds(20) + ) + + if (!commandExists(cfg.program)) ignore(s"Command ${cfg.program} not found") + else { + File + .withTempDir[IO](target, "tesseract") + .use(dir => + IO { + val tessCfg = TesseractConfig(cfg, target) + val (pdf, txt) = + Tesseract + .toPDF[IO, (Path, Path)](tessCfg, Language.German, 8192, blocker, logger)( + ExampleFiles.camera_letter_en_jpg.readURL[IO](8192, blocker), + storePdfTxtHandler(dir.resolve("test.pdf"), dir.resolve("test.txt")) + ) + .unsafeRunSync() + + assert(pdf.isNonEmpty && pdf.isPDF) + assert(txt.isNonEmpty && txt.isPlainText) + } + ) + .unsafeRunSync + } + } + + +} diff --git a/modules/extract/NOTICE b/modules/extract/NOTICE new file mode 100644 index 00000000..05ccbbcc --- /dev/null +++ b/modules/extract/NOTICE @@ -0,0 +1,11 @@ +The Java source files in docspell-extract are unmodified copies of +those found in the Apache Tika parser project. It follows the +NOTICE.txt file from Apache Tika parsers: + +Apache Tika parsers +Copyright 2007-2019 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java new file mode 100644 index 00000000..80b2301c --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.odf; + +import org.apache.tika.sax.ContentHandlerDecorator; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Locale; + +/** + * Content handler decorator that: + */ +public class NSNormalizerContentHandler extends ContentHandlerDecorator { + + private static final String OLD_NS = + "http://openoffice.org/2000/"; + + private static final String NEW_NS = + "urn:oasis:names:tc:opendocument:xmlns:"; + + private static final String DTD_PUBLIC_ID = + "-//OpenOffice.org//DTD OfficeDocument 1.0//EN"; + + public NSNormalizerContentHandler(ContentHandler handler) { + super(handler); + } + + private String mapOldNS(String ns) { + if (ns != null && ns.startsWith(OLD_NS)) { + return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0"; + } else { + return ns; + } + } + + @Override + public void startElement( + String namespaceURI, String localName, String qName, + Attributes atts) throws SAXException { + AttributesImpl natts = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + natts.addAttribute( + mapOldNS(atts.getURI(i)), atts.getLocalName(i), + atts.getQName(i), atts.getType(i), atts.getValue(i)); + } + super.startElement(mapOldNS(namespaceURI), localName, qName, atts); + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) + throws SAXException { + super.endElement(mapOldNS(namespaceURI), localName, qName); + } + + @Override + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + super.startPrefixMapping(prefix, mapOldNS(uri)); + } + + /** + * do not load any DTDs (may be requested by parser). Fake the DTD by + * returning a empty string as InputSource + */ + @Override + public InputSource resolveEntity(String publicId, String systemId) + throws IOException, SAXException { + if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) + || DTD_PUBLIC_ID.equals(publicId)) { + return new InputSource(new StringReader("")); + } else { + return super.resolveEntity(publicId, systemId); + } + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java new file mode 100644 index 00000000..066f3e95 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java @@ -0,0 +1,606 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.odf; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ElementMappingContentHandler; +import org.apache.tika.sax.ElementMappingContentHandler.TargetElement; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; + +import javax.xml.namespace.QName; +import java.io.IOException; +import java.io.InputStream; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +/** + * Parser for ODF content.xml files. + */ +public class OpenDocumentContentParser extends AbstractParser { + private interface Style { + } + + private static class TextStyle implements Style { + public boolean italic; + public boolean bold; + public boolean underlined; + + @Override + public String toString() { + return "TextStyle{" + + "italic=" + italic + + ", bold=" + bold + + ", underlined=" + underlined + + '}'; + } + } + + private static class ListStyle implements Style { + public boolean ordered; + + public String getTag() { + return ordered ? "ol" : "ul"; + } + } + + private static final class OpenDocumentElementMappingContentHandler extends + ElementMappingContentHandler { + private static final char[] SPACE = new char[]{ ' '}; + private static final String CLASS = "class"; + private static final Attributes ANNOTATION_ATTRIBUTES = buildAttributes(CLASS, "annotation"); + private static final Attributes NOTE_ATTRIBUTES = buildAttributes(CLASS, "note"); + private static final Attributes NOTES_ATTRIBUTES = buildAttributes(CLASS, "notes"); + + private static Attributes buildAttributes(String key, String value) { + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", key, key, "CDATA", value); + return attrs; + } + + private final ContentHandler handler; + private final BitSet textNodeStack = new BitSet(); + private int nodeDepth = 0; + private int completelyFiltered = 0; + private Stack headingStack = new Stack(); + private Map paragraphTextStyleMap = new HashMap(); + private Map textStyleMap = new HashMap(); + private Map listStyleMap = new HashMap(); + private String currParagraphStyleName; //paragraph style name + private TextStyle currTextStyle; //this is the text style for particular spans/paragraphs + private String currTextStyleName; + + private Stack listStyleStack = new Stack(); + private ListStyle listStyle; + + // True if we are currently in the named style: + private boolean curUnderlined; + private boolean curBold; + private boolean curItalic; + + //have we written the start style tags + //yet for the current text style + boolean hasWrittenStartStyleTags = false; + + private int pDepth = 0; //

<p> can appear inside comments and other things that are already inside <p>
+ //we need to track our pDepth and only output <p>
if we're at the main level + + + private OpenDocumentElementMappingContentHandler(ContentHandler handler, + Map mappings) { + super(handler, mappings); + this.handler = handler; + } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + // only forward content of tags from text:-namespace + if (completelyFiltered == 0 && nodeDepth > 0 + && textNodeStack.get(nodeDepth - 1)) { + if (!hasWrittenStartStyleTags) { + updateStyleTags(); + hasWrittenStartStyleTags = true; + } + super.characters(ch, start, length); + } + } + + // helper for checking tags which need complete filtering + // (with sub-tags) + private boolean needsCompleteFiltering( + String namespaceURI, String localName) { + if (TEXT_NS.equals(namespaceURI)) { + return localName.endsWith("-template") + || localName.endsWith("-style"); + } + return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName); + } + + // map the heading level to HTML tags + private String getXHTMLHeaderTagName(Attributes atts) { + String depthStr = atts.getValue(TEXT_NS, "outline-level"); + if (depthStr == null) { + return "h1"; + } + + int depth = Integer.parseInt(depthStr); + if (depth >= 6) { + return "h6"; + } else if (depth <= 1) { + return "h1"; + } else { + return "h" + depth; + } + } + + /** + * Check if a node is a text node + */ + private boolean isTextNode(String namespaceURI, String localName) { + if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) { + return true; + } + if (SVG_NS.equals(namespaceURI)) { + return "title".equals(localName) || + "desc".equals(localName); + } + return false; + } + + private void startList(String name) throws SAXException { + String elementName = "ul"; + if (name != null) { + ListStyle style = listStyleMap.get(name); + elementName = style != null ? style.getTag() : "ul"; + listStyleStack.push(style); + } + handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES); + } + + private void endList() throws SAXException { + String elementName = "ul"; + if (!listStyleStack.isEmpty()) { + ListStyle style = listStyleStack.pop(); + elementName = style != null ? 
style.getTag() : "ul"; + } + handler.endElement(XHTML, elementName, elementName); + } + + private void startSpan(String name) throws SAXException { + if (name == null) { + return; + } + currTextStyle = textStyleMap.get(name); + hasWrittenStartStyleTags = false; + } + + private void startParagraph(String styleName) throws SAXException { + if (pDepth == 0) { + handler.startElement(XHTML, "p", "p", EMPTY_ATTRIBUTES); + if (styleName != null) { + currTextStyle = paragraphTextStyleMap.get(styleName); + } + hasWrittenStartStyleTags = false; + } else { + handler.characters(SPACE, 0, SPACE.length); + } + pDepth++; + } + + private void endParagraph() throws SAXException { + closeStyleTags(); + if (pDepth == 1) { + handler.endElement(XHTML, "p", "p"); + } else { + handler.characters(SPACE, 0, SPACE.length); + } + pDepth--; + + } + + private void updateStyleTags() throws SAXException { + + if (currTextStyle == null) { + closeStyleTags(); + return; + } + if (currTextStyle.bold != curBold) { + // Enforce nesting -- must close s and i tags + if (curUnderlined) { + handler.endElement(XHTML, "u", "u"); + curUnderlined = false; + } + if (curItalic) { + handler.endElement(XHTML, "i", "i"); + curItalic = false; + } + if (currTextStyle.bold) { + handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES); + } else { + handler.endElement(XHTML, "b", "b"); + } + curBold = currTextStyle.bold; + } + + if (currTextStyle.italic != curItalic) { + // Enforce nesting -- must close s tag + if (curUnderlined) { + handler.endElement(XHTML, "u", "u"); + curUnderlined = false; + } + if (currTextStyle.italic) { + handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES); + } else { + handler.endElement(XHTML, "i", "i"); + } + curItalic = currTextStyle.italic; + } + + if (currTextStyle.underlined != curUnderlined) { + if (currTextStyle.underlined) { + handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES); + } else { + handler.endElement(XHTML, "u", "u"); + } + curUnderlined = currTextStyle.underlined; + } + } + + private void endSpan() throws SAXException { + updateStyleTags(); + } + + private void closeStyleTags() throws SAXException { + // Close any still open style tags + if (curUnderlined) { + handler.endElement(XHTML,"u", "u"); + curUnderlined = false; + } + if (curItalic) { + handler.endElement(XHTML,"i", "i"); + curItalic = false; + } + if (curBold) { + handler.endElement(XHTML,"b", "b"); + curBold = false; + } + currTextStyle = null; + hasWrittenStartStyleTags = false; + } + + @Override + public void startElement( + String namespaceURI, String localName, String qName, + Attributes attrs) throws SAXException { + // keep track of current node type. If it is a text node, + // a bit at the current depth its set in textNodeStack. + // characters() checks the top bit to determine, if the + // actual node is a text node to print out nodeDepth contains + // the depth of the current node and also marks top of stack. 
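+ // nodeDepth doubles as the stack pointer into textNodeStack: bit i
+ // is set iff the element at depth i is a text node.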
+ assert nodeDepth >= 0; + + // Set styles + if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { + String family = attrs.getValue(STYLE_NS, "family"); + if ("text".equals(family)) { + currTextStyle = new TextStyle(); + currTextStyleName = attrs.getValue(STYLE_NS, "name"); + } else if ("paragraph".equals(family)) { + currTextStyle = new TextStyle(); + currParagraphStyleName = attrs.getValue(STYLE_NS, "name"); + } + } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { + listStyle = new ListStyle(); + String name = attrs.getValue(STYLE_NS, "name"); + listStyleMap.put(name, listStyle); + } else if (currTextStyle != null && STYLE_NS.equals(namespaceURI) + && "text-properties".equals(localName)) { + String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style"); + if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) { + currTextStyle.italic = true; + } + String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"); + if ("bold".equals(fontWeight) || "bolder".equals(fontWeight) + || (fontWeight != null && Character.isDigit(fontWeight.charAt(0)) + && Integer.valueOf(fontWeight) > 500)) { + currTextStyle.bold = true; + } + String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style"); + if (underlineStyle != null && !underlineStyle.equals("none")) { + currTextStyle.underlined = true; + } + } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) { + if ("list-level-style-bullet".equals(localName)) { + listStyle.ordered = false; + } else if ("list-level-style-number".equals(localName)) { + listStyle.ordered = true; + } + } + + textNodeStack.set(nodeDepth++, + isTextNode(namespaceURI, localName)); + // filter *all* content of some tags + assert completelyFiltered >= 0; + + if (needsCompleteFiltering(namespaceURI, localName)) { + completelyFiltered++; + } + // call next handler if no filtering + if (completelyFiltered == 0) { + // special handling of text:h, that are directly passed + // to incoming handler + if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { + final String el = headingStack.push(getXHTMLHeaderTagName(attrs)); + handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES); + } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { + startList(attrs.getValue(TEXT_NS, "style-name")); + } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { + startSpan(attrs.getValue(TEXT_NS, "style-name")); + } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { + startParagraph(attrs.getValue(TEXT_NS, "style-name")); + } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) { + handler.characters(SPACE, 0, 1); + } else if ("annotation".equals(localName)) { + closeStyleTags(); + handler.startElement(XHTML, "span", "p", ANNOTATION_ATTRIBUTES); + } else if ("note".equals(localName)) { + closeStyleTags(); + handler.startElement(XHTML, "span", "p", NOTE_ATTRIBUTES); + } else if ("notes".equals(localName)) { + closeStyleTags(); + handler.startElement(XHTML, "span", "p", NOTES_ATTRIBUTES); + } else { + super.startElement(namespaceURI, localName, qName, attrs); + } + } + } + + @Override + public void endElement( + String namespaceURI, String localName, String qName) + throws SAXException { + if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { + if (currTextStyle != null && currTextStyleName != null) { + textStyleMap.put(currTextStyleName, currTextStyle); + currTextStyleName = null; + currTextStyle = null; + } else if (currTextStyle != 
null && currParagraphStyleName != null) { + paragraphTextStyleMap.put(currParagraphStyleName, currTextStyle); + currParagraphStyleName = null; + currTextStyle = null; + } + } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { + listStyle = null; + } + + // call next handler if no filtering + if (completelyFiltered == 0) { + // special handling of text:h, that are directly passed + // to incoming handler + if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { + final String el = headingStack.pop(); + handler.endElement(XHTMLContentHandler.XHTML, el, el); + } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { + endList(); + } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { + currTextStyle = null; + hasWrittenStartStyleTags = false; + } else if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { + endParagraph(); + } else if ("annotation".equals(localName) || "note".equals(localName) || + "notes".equals(localName)) { + closeStyleTags(); + handler.endElement("", localName, localName); + } else { + super.endElement(namespaceURI, localName, qName); + } + + // special handling of tabulators + if (TEXT_NS.equals(namespaceURI) + && ("tab-stop".equals(localName) + || "tab".equals(localName))) { + this.characters(TAB, 0, TAB.length); + } + } + + // revert filter for *all* content of some tags + if (needsCompleteFiltering(namespaceURI, localName)) { + completelyFiltered--; + } + assert completelyFiltered >= 0; + + // reduce current node depth + nodeDepth--; + assert nodeDepth >= 0; + } + + @Override + public void startPrefixMapping(String prefix, String uri) { + // remove prefix mappings as they should not occur in XHTML + } + + @Override + public void endPrefixMapping(String prefix) { + // remove prefix mappings as they should not occur in XHTML + } + } + + public static final String TEXT_NS = + "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; + + public static final String TABLE_NS = + "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; + + public static final String STYLE_NS = + "urn:oasis:names:tc:opendocument:xmlns:style:1.0"; + + public static final String FORMATTING_OBJECTS_NS = + "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"; + + public static final String OFFICE_NS = + "urn:oasis:names:tc:opendocument:xmlns:office:1.0"; + + public static final String SVG_NS = + "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"; + + public static final String PRESENTATION_NS = + "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"; + + public static final String DRAW_NS = + "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"; + + public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; + + protected static final char[] TAB = new char[]{'\t'}; + + private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); + + /** + * Mappings between ODF tag names and XHTML tag names + * (including attributes). All other tag names/attributes are ignored + * and left out from event stream. 
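+ * For example, text:p maps to XHTML p, text:list-item to li,
+ * table:table-row to tr and draw:text-box to div (see the static
+ * initializer below).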
+ */ + private static final HashMap MAPPINGS = + new HashMap(); + + static { + // general mappings of text:-tags + MAPPINGS.put( + new QName(TEXT_NS, "p"), + new TargetElement(XHTML, "p")); + // text:h-tags are mapped specifically in startElement/endElement + MAPPINGS.put( + new QName(TEXT_NS, "line-break"), + new TargetElement(XHTML, "br")); + MAPPINGS.put( + new QName(TEXT_NS, "list-item"), + new TargetElement(XHTML, "li")); + MAPPINGS.put( + new QName(TEXT_NS, "note"), + new TargetElement(XHTML, "span")); + MAPPINGS.put( + new QName(OFFICE_NS, "annotation"), + new TargetElement(XHTML, "span")); + MAPPINGS.put( + new QName(PRESENTATION_NS, "notes"), + new TargetElement(XHTML, "span")); + MAPPINGS.put( + new QName(DRAW_NS, "object"), + new TargetElement(XHTML, "object")); + MAPPINGS.put( + new QName(DRAW_NS, "text-box"), + new TargetElement(XHTML, "div")); + MAPPINGS.put( + new QName(SVG_NS, "title"), + new TargetElement(XHTML, "span")); + MAPPINGS.put( + new QName(SVG_NS, "desc"), + new TargetElement(XHTML, "span")); + MAPPINGS.put( + new QName(TEXT_NS, "span"), + new TargetElement(XHTML, "span")); + + final HashMap aAttsMapping = + new HashMap(); + aAttsMapping.put( + new QName(XLINK_NS, "href"), + new QName("href")); + aAttsMapping.put( + new QName(XLINK_NS, "title"), + new QName("title")); + MAPPINGS.put( + new QName(TEXT_NS, "a"), + new TargetElement(XHTML, "a", aAttsMapping)); + + // create HTML tables from table:-tags + MAPPINGS.put( + new QName(TABLE_NS, "table"), + new TargetElement(XHTML, "table")); + // repeating of rows is ignored; for columns, see below! + MAPPINGS.put( + new QName(TABLE_NS, "table-row"), + new TargetElement(XHTML, "tr")); + // special mapping for rowspan/colspan attributes + final HashMap tableCellAttsMapping = + new HashMap(); + tableCellAttsMapping.put( + new QName(TABLE_NS, "number-columns-spanned"), + new QName("colspan")); + tableCellAttsMapping.put( + new QName(TABLE_NS, "number-rows-spanned"), + new QName("rowspan")); + /* TODO: The following is not correct, the cell should be repeated not spanned! + * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct. + * Problems may occur when both spanning and repeating is given, which is not allowed by spec. + * Cell spanning instead of repeating is not a problem, because OpenOffice uses it + * only for empty cells. 
+ */ + tableCellAttsMapping.put( + new QName(TABLE_NS, "number-columns-repeated"), + new QName("colspan")); + MAPPINGS.put( + new QName(TABLE_NS, "table-cell"), + new TargetElement(XHTML, "td", tableCellAttsMapping)); + } + + public Set getSupportedTypes(ParseContext context) { + return Collections.emptySet(); // not a top-level parser + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + parseInternal(stream, + new XHTMLContentHandler(handler, metadata), + metadata, context); + } + + void parseInternal( + InputStream stream, final ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS); + + + XMLReaderUtils.parseSAX( + new CloseShieldInputStream(stream), + new OfflineContentHandler( + new NSNormalizerContentHandler(dh)), + context); + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java new file mode 100644 index 00000000..11922d7d --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.DublinCore; +import org.apache.tika.metadata.MSOffice; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.PagedText; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.xml.AttributeDependantMetadataHandler; +import org.apache.tika.parser.xml.AttributeMetadataHandler; +import org.apache.tika.parser.xml.ElementMetadataHandler; +import org.apache.tika.parser.xml.MetadataHandler; +import org.apache.tika.parser.xml.XMLParser; +import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.xpath.CompositeMatcher; +import org.apache.tika.sax.xpath.Matcher; +import org.apache.tika.sax.xpath.MatchingContentHandler; +import org.apache.tika.sax.xpath.XPathParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Parser for OpenDocument meta.xml files. 
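+ * Extracts the Dublin Core elements, the meta:-namespace entries and
+ * the document-statistic attributes into Tika metadata properties.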
+ */ +public class OpenDocumentMetaParser extends XMLParser { + /** + * Serial version UID + */ + private static final long serialVersionUID = -8739250869531737584L; + + private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0"; + private static final XPathParser META_XPATH = new XPathParser("meta", META_NS); + + /** + * @see OfficeOpenXMLCore#SUBJECT + * @deprecated use OfficeOpenXMLCore#SUBJECT + */ + @Deprecated + private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR = + Property.composite(Office.INITIAL_AUTHOR, + new Property[]{Property.externalText("initial-creator")}); + + private static ContentHandler getDublinCoreHandler( + Metadata metadata, Property property, String element) { + return new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, element, + metadata, property); + } + + private static ContentHandler getMeta( + ContentHandler ch, Metadata md, Property property, String element) { + Matcher matcher = new CompositeMatcher( + META_XPATH.parse("//meta:" + element), + META_XPATH.parse("//meta:" + element + "//text()")); + ContentHandler branch = + new MatchingContentHandler(new MetadataHandler(md, property), matcher); + return new TeeContentHandler(ch, branch); + } + + private static ContentHandler getUserDefined( + ContentHandler ch, Metadata md) { + Matcher matcher = new CompositeMatcher( + META_XPATH.parse("//meta:user-defined/@meta:name"), + META_XPATH.parse("//meta:user-defined//text()")); + // eg Text1 becomes custom:Info1=Text1 + ContentHandler branch = new MatchingContentHandler( + new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX), + matcher); + return new TeeContentHandler(ch, branch); + } + + @Deprecated + private static ContentHandler getStatistic( + ContentHandler ch, Metadata md, String name, String attribute) { + Matcher matcher = + META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); + ContentHandler branch = new MatchingContentHandler( + new AttributeMetadataHandler(META_NS, attribute, md, name), matcher); + return new TeeContentHandler(ch, branch); + } + + private static ContentHandler getStatistic( + ContentHandler ch, Metadata md, Property property, String attribute) { + Matcher matcher = + META_XPATH.parse("//meta:document-statistic/@meta:" + attribute); + ContentHandler branch = new MatchingContentHandler( + new AttributeMetadataHandler(META_NS, attribute, md, property), matcher); + return new TeeContentHandler(ch, branch); + } + + protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) { + // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date + // Process the Dublin Core Attributes + ch = new TeeContentHandler(super.getContentHandler(ch, md, context), + getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"), + getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"), + getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"), + getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"), + getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"), + getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"), + getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"), + getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"), + getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"), + getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights")); + + // Process the OO Meta Attributes + ch = 
getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date"); + // ODF uses dc:date for modified + ch = new TeeContentHandler(ch, new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, "date", + md, TikaCoreProperties.MODIFIED)); + + // ODF uses dc:subject for description + ch = new TeeContentHandler(ch, new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, "subject", + md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT)); + ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword"); + + ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration"); + ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles"); + ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator"); + ch = getMeta(ch, md, Property.externalText("generator"), "generator"); + + // Process the user defined Meta Attributes + ch = getUserDefined(ch, md); + + // Process the OO Statistics Attributes + ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count"); + ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count"); + ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count"); + ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count"); + ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count"); + ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count"); + ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count"); + ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count"); + + // Legacy, Tika-1.0 style attributes + // TODO Remove these in Tika 2.0 + ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count"); + ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count"); + ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count"); + ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count"); + ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count"); + ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count"); + ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count"); + + // Legacy Statistics Attributes, replaced with real keys above + // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770) + ch = getStatistic(ch, md, "nbPage", "page-count"); + ch = getStatistic(ch, md, "nbPara", "paragraph-count"); + ch = getStatistic(ch, md, "nbWord", "word-count"); + ch = getStatistic(ch, md, "nbCharacter", "character-count"); + ch = getStatistic(ch, md, "nbTab", "table-count"); + ch = getStatistic(ch, md, "nbObject", "object-count"); + ch = getStatistic(ch, md, "nbImg", "image-count"); + + // Normalise the rest + ch = new NSNormalizerContentHandler(ch); + return ch; + } + + @Override + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + super.parse(stream, handler, metadata, context); + // Copy subject to description for OO2 + String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT); + if (odfSubject != null && !odfSubject.equals("") && + (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) { + metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject); + } + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java new file mode 100644 index 00000000..6ba5281f --- /dev/null +++ 
b/modules/extract/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.EndDocumentShieldingContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipInputStream; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * OpenOffice parser + */ +public class OpenDocumentParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -6410276875438618287L; + + private static final Set SUPPORTED_TYPES = + Collections.unmodifiableSet(new HashSet(Arrays.asList( + MediaType.application("vnd.sun.xml.writer"), + MediaType.application("vnd.oasis.opendocument.text"), + MediaType.application("vnd.oasis.opendocument.graphics"), + MediaType.application("vnd.oasis.opendocument.presentation"), + MediaType.application("vnd.oasis.opendocument.spreadsheet"), + MediaType.application("vnd.oasis.opendocument.chart"), + MediaType.application("vnd.oasis.opendocument.image"), + MediaType.application("vnd.oasis.opendocument.formula"), + MediaType.application("vnd.oasis.opendocument.text-master"), + MediaType.application("vnd.oasis.opendocument.text-web"), + MediaType.application("vnd.oasis.opendocument.text-template"), + MediaType.application("vnd.oasis.opendocument.graphics-template"), + MediaType.application("vnd.oasis.opendocument.presentation-template"), + MediaType.application("vnd.oasis.opendocument.spreadsheet-template"), + MediaType.application("vnd.oasis.opendocument.chart-template"), + MediaType.application("vnd.oasis.opendocument.image-template"), + MediaType.application("vnd.oasis.opendocument.formula-template"), + 
MediaType.application("x-vnd.oasis.opendocument.text"), + MediaType.application("x-vnd.oasis.opendocument.graphics"), + MediaType.application("x-vnd.oasis.opendocument.presentation"), + MediaType.application("x-vnd.oasis.opendocument.spreadsheet"), + MediaType.application("x-vnd.oasis.opendocument.chart"), + MediaType.application("x-vnd.oasis.opendocument.image"), + MediaType.application("x-vnd.oasis.opendocument.formula"), + MediaType.application("x-vnd.oasis.opendocument.text-master"), + MediaType.application("x-vnd.oasis.opendocument.text-web"), + MediaType.application("x-vnd.oasis.opendocument.text-template"), + MediaType.application("x-vnd.oasis.opendocument.graphics-template"), + MediaType.application("x-vnd.oasis.opendocument.presentation-template"), + MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"), + MediaType.application("x-vnd.oasis.opendocument.chart-template"), + MediaType.application("x-vnd.oasis.opendocument.image-template"), + MediaType.application("x-vnd.oasis.opendocument.formula-template")))); + + private static final String META_NAME = "meta.xml"; + + private Parser meta = new OpenDocumentMetaParser(); + + private Parser content = new OpenDocumentContentParser(); + + public Parser getMetaParser() { + return meta; + } + + public void setMetaParser(Parser meta) { + this.meta = meta; + } + + public Parser getContentParser() { + return content; + } + + public void setContentParser(Parser content) { + this.content = content; + } + + public Set getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler baseHandler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + // Open the Zip stream + // Use a File if we can, and an already open zip is even better + ZipFile zipFile = null; + ZipInputStream zipStream = null; + if (stream instanceof TikaInputStream) { + TikaInputStream tis = (TikaInputStream) stream; + Object container = ((TikaInputStream) stream).getOpenContainer(); + if (container instanceof ZipFile) { + zipFile = (ZipFile) container; + } else if (tis.hasFile()) { + zipFile = new ZipFile(tis.getFile()); + } else { + zipStream = new ZipInputStream(stream); + } + } else { + zipStream = new ZipInputStream(stream); + } + + // Prepare to handle the content + XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); + + // As we don't know which of the metadata or the content + // we'll hit first, catch the endDocument call initially + EndDocumentShieldingContentHandler handler = + new EndDocumentShieldingContentHandler(xhtml); + + if (zipFile != null) { + try { + handleZipFile(zipFile, metadata, context, handler); + } finally { + //Do we want to close silently == catch an exception here? + zipFile.close(); + } + } else { + try { + handleZipStream(zipStream, metadata, context, handler); + } finally { + //Do we want to close silently == catch an exception here? 
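+ //Note that ZipInputStream.close() also closes the caller's underlying stream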
+ zipStream.close(); + } + } + + // Only now call the end document + if (handler.getEndDocumentWasCalled()) { + handler.reallyEndDocument(); + } + } + + private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException { + ZipEntry entry = zipStream.getNextEntry(); + if (entry == null) { + throw new IOException("No entries found in ZipInputStream"); + } + do { + handleZipEntry(entry, zipStream, metadata, context, handler); + entry = zipStream.getNextEntry(); + } while (entry != null); + } + + private void handleZipFile(ZipFile zipFile, Metadata metadata, + ParseContext context, EndDocumentShieldingContentHandler handler) + throws IOException, TikaException, SAXException { + // If we can, process the metadata first, then the + // rest of the file afterwards (TIKA-1353) + // Only possible to guarantee that when opened from a file not a stream + + ZipEntry entry = zipFile.getEntry(META_NAME); + if (entry != null) { + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + } + + Enumeration entries = zipFile.entries(); + while (entries.hasMoreElements()) { + entry = entries.nextElement(); + if (!META_NAME.equals(entry.getName())) { + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + } + } + } + private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, + ParseContext context, EndDocumentShieldingContentHandler handler) + throws IOException, SAXException, TikaException { + if (entry == null) return; + + if (entry.getName().equals("mimetype")) { + String type = IOUtils.toString(zip, UTF_8); + metadata.set(Metadata.CONTENT_TYPE, type); + } else if (entry.getName().equals(META_NAME)) { + meta.parse(zip, new DefaultHandler(), metadata, context); + } else if (entry.getName().endsWith("content.xml")) { + if (content instanceof OpenDocumentContentParser) { + ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); + } else { + // Foreign content parser was set: + content.parse(zip, handler, metadata, context); + } + } else if (entry.getName().endsWith("styles.xml")) { + if (content instanceof OpenDocumentContentParser) { + ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); + } else { + // Foreign content parser was set: + content.parse(zip, handler, metadata, context); + } + } else { + String embeddedName = entry.getName(); + //scrape everything under Thumbnails/ and Pictures/ + if (embeddedName.contains("Thumbnails/") || + embeddedName.contains("Pictures/")) { + EmbeddedDocumentExtractor embeddedDocumentExtractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + Metadata embeddedMetadata = new Metadata(); + embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName()); + /* if (embeddedName.startsWith("Thumbnails/")) { + embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.THUMBNAIL); + }*/ + if (embeddedName.contains("Pictures/")) { + embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + } + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + embeddedDocumentExtractor.parseEmbedded(zip, + new EmbeddedContentHandler(handler), embeddedMetadata, false); + } + } + + } + } +} diff --git 
a/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java new file mode 100644 index 00000000..cbff35e7 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AbstractMetadataHandler.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.helpers.DefaultHandler; + +import java.util.Arrays; +import java.util.List; + +/** + * Base class for SAX handlers that map SAX events into document metadata. + * + * @since Apache Tika 0.10 + */ +class AbstractMetadataHandler extends DefaultHandler { + + private final Metadata metadata; + private final Property property; + private final String name; + + protected AbstractMetadataHandler(Metadata metadata, String name) { + this.metadata = metadata; + this.property = null; + this.name = name; + } + protected AbstractMetadataHandler(Metadata metadata, Property property) { + this.metadata = metadata; + this.property = property; + this.name = property.getName(); + } + + /** + * Adds the given metadata value. The value is ignored if it is + * null or empty. If the metadata entry already exists, + * then the given value is appended to it with a comma as the separator. 
+ * + * @param value metadata value + */ + protected void addMetadata(String value) { + if (value != null && value.length() > 0) { + if (metadata.isMultiValued(name)) { + // Add the value, assuming it's not already there + List previous = Arrays.asList(metadata.getValues(name)); + if (!previous.contains(value)) { + if (property != null) { + metadata.add(property, value); + } else { + metadata.add(name, value); + } + } + } else { + // Set the value, assuming it's not already there + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + if (!previous.equals(value)) { + if (property != null) { + if (property.isMultiValuePermitted()) { + metadata.add(property, value); + } else { + // Replace the existing value if isMultiValuePermitted is false + metadata.set(property, value); + } + } else { + metadata.add(name, value); + } + } + } else { + if (property != null) { + metadata.set(property, value); + } else { + metadata.set(name, value); + } + } + } + } + } +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java new file mode 100644 index 00000000..c1795fad --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This adds a Metadata entry for a given node. + * The textual content of the node is used as the + * value, and the Metadata name is taken from + * an attribute, with a prefix if required. 
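+ * For example, OpenDocumentMetaParser feeds it meta:user-defined
+ * elements, so a meta:name of "Info1" with text "Text1" becomes
+ * custom:Info1=Text1.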
+ */ +public class AttributeDependantMetadataHandler extends DefaultHandler { + + private final Metadata metadata; + + private final String nameHoldingAttribute; + private final String namePrefix; + private String name; + + private final StringBuilder buffer = new StringBuilder(); + + public AttributeDependantMetadataHandler(Metadata metadata, String nameHoldingAttribute, String namePrefix) { + this.metadata = metadata; + this.nameHoldingAttribute = nameHoldingAttribute; + this.namePrefix = namePrefix; + } + + public void addMetadata(String value) { + if(name == null || name.length() == 0) { + // We didn't find the attribute which holds the name + return; + } + if (value.length() > 0) { + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + value = previous + ", " + value; + } + metadata.set(name, value); + } + } + + public void endElement(String uri, String localName, String name) { + addMetadata(buffer.toString()); + buffer.setLength(0); + } + + public void startElement( + String uri, String localName, String name, Attributes attributes) { + String rawName = attributes.getValue(nameHoldingAttribute); + if (rawName != null) { + if (namePrefix == null) { + this.name = rawName; + } else { + this.name = namePrefix + rawName; + } + } + // All other attributes are ignored + } + + + public void characters(char[] ch, int start, int length) { + buffer.append(ch, start, length); + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java new file mode 100644 index 00000000..dba5e4cb --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; + +/** + * SAX event handler that maps the contents of an XML attribute into + * a metadata field. 
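+ * OpenDocumentMetaParser uses it to map meta:document-statistic
+ * attributes such as meta:page-count onto statistics properties.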
+ * + * @since Apache Tika 0.10 + */ +public class AttributeMetadataHandler extends AbstractMetadataHandler { + + private final String uri; + + private final String localName; + + public AttributeMetadataHandler( + String uri, String localName, Metadata metadata, String name) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + } + public AttributeMetadataHandler( + String uri, String localName, Metadata metadata, Property property) { + super(metadata, property); + this.uri = uri; + this.localName = localName; + } + + @Override + public void startElement( + String uri, String localName, String qName, Attributes attributes) + throws SAXException { + for (int i = 0; i < attributes.getLength(); i++) { + if (attributes.getURI(i).equals(this.uri) + && attributes.getLocalName(i).equals(this.localName)) { + addMetadata(attributes.getValue(i).trim()); + } + } + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java new file mode 100644 index 00000000..5999773e --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.DublinCore; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.TeeContentHandler; +import org.xml.sax.ContentHandler; + +/** + * Dublin Core metadata parser + */ +public class DcXMLParser extends XMLParser { + + /** Serial version UID */ + private static final long serialVersionUID = 4905318835463880819L; + + private static ContentHandler getDublinCoreHandler( + Metadata metadata, Property property, String element) { + return new ElementMetadataHandler( + DublinCore.NAMESPACE_URI_DC, element, + metadata, property); + } + + protected ContentHandler getContentHandler( + ContentHandler handler, Metadata metadata, ParseContext context) { + return new TeeContentHandler( + super.getContentHandler(handler, metadata, context), + getDublinCoreHandler(metadata, TikaCoreProperties.TITLE, "title"), + getDublinCoreHandler(metadata, TikaCoreProperties.KEYWORDS, "subject"), + getDublinCoreHandler(metadata, TikaCoreProperties.CREATOR, "creator"), + getDublinCoreHandler(metadata, TikaCoreProperties.DESCRIPTION, "description"), + getDublinCoreHandler(metadata, TikaCoreProperties.PUBLISHER, "publisher"), + getDublinCoreHandler(metadata, TikaCoreProperties.CONTRIBUTOR, "contributor"), + getDublinCoreHandler(metadata, TikaCoreProperties.CREATED, "date"), + getDublinCoreHandler(metadata, TikaCoreProperties.TYPE, "type"), + getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, "format"), + getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, "identifier"), + getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, "language"), + getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, "rights")); + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java new file mode 100644 index 00000000..d7a81dc4 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/ElementMetadataHandler.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; + +import java.util.Arrays; + +/** + * SAX event handler that maps the contents of an XML element into + * a metadata field. 
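+ * Values may come from a plain element or from an RDF Bag of li
+ * elements nested inside it; both end up in the same metadata field.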
+ * + * @since Apache Tika 0.10 + */ +public class ElementMetadataHandler extends AbstractMetadataHandler { + private static final Logger LOG = LoggerFactory.getLogger(ElementMetadataHandler.class); + + private static final String LOCAL_NAME_RDF_BAG = "Bag"; + private static final String LOCAL_NAME_RDF_LI = "li"; + private static final String URI_RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + + private final String uri; + + private final String localName; + + private final Metadata metadata; + + private final String name; + private Property targetProperty; + + private final boolean allowDuplicateValues; + private final boolean allowEmptyValues; + + /** + * The buffer used to capture characters when inside a bag li element. + */ + private final StringBuilder bufferBagged = new StringBuilder(); + + /** + * The buffer used to capture characters inside standard elements. + */ + private final StringBuilder bufferBagless = new StringBuilder(); + + /** + * Whether or not the value was found in a standard element structure or inside a bag. + */ + private boolean isBagless = true; + + private int matchLevel = 0; + private int parentMatchLevel = 0; + + /** + * Constructor for string metadata keys. + * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param name the Tika metadata field key + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, String name) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.name = name; + this.allowDuplicateValues = false; + this.allowEmptyValues = false; + LOG.trace("created simple handler for {}", this.name); + } + + /** + * Constructor for string metadata keys which allows change of behavior + * for duplicate and empty entry values. + * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param name the Tika metadata field key + * @param allowDuplicateValues add duplicate values to the Tika metadata + * @param allowEmptyValues add empty values to the Tika metadata + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, String name, boolean allowDuplicateValues, boolean allowEmptyValues) { + super(metadata, name); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.name = name; + this.allowDuplicateValues = allowDuplicateValues; + this.allowEmptyValues = allowEmptyValues; + LOG.trace("created simple handler for {}", this.name); + } + + /** + * Constructor for Property metadata keys. + * + * @param uri the uri of the namespace of the element + * @param localName the local name of the element + * @param metadata the Tika metadata object to populate + * @param targetProperty the Tika metadata Property key + */ + public ElementMetadataHandler( + String uri, String localName, Metadata metadata, Property targetProperty) { + super(metadata, targetProperty); + this.uri = uri; + this.localName = localName; + this.metadata = metadata; + this.targetProperty = targetProperty; + this.name = targetProperty.getName(); + this.allowDuplicateValues = false; + this.allowEmptyValues = false; + LOG.trace("created property handler for {}", this.name); + } + + /** + * Constructor for Property metadata keys which allows change of behavior + * for duplicate and empty entry values. 
+     *
+     * @param uri the uri of the namespace of the element
+     * @param localName the local name of the element
+     * @param metadata the Tika metadata object to populate
+     * @param targetProperty the Tika metadata Property key
+     * @param allowDuplicateValues add duplicate values to the Tika metadata
+     * @param allowEmptyValues add empty values to the Tika metadata
+     */
+    public ElementMetadataHandler(
+            String uri, String localName, Metadata metadata, Property targetProperty,
+            boolean allowDuplicateValues, boolean allowEmptyValues) {
+        super(metadata, targetProperty);
+        this.uri = uri;
+        this.localName = localName;
+        this.metadata = metadata;
+        this.targetProperty = targetProperty;
+        this.name = targetProperty.getName();
+        this.allowDuplicateValues = allowDuplicateValues;
+        this.allowEmptyValues = allowEmptyValues;
+        LOG.trace("created property handler for {}", this.name);
+    }
+
+    protected boolean isMatchingParentElement(String uri, String localName) {
+        return (uri.equals(this.uri) && localName.equals(this.localName));
+    }
+
+    protected boolean isMatchingElement(String uri, String localName) {
+        // match if we're inside the parent element or within some bag element
+        return (uri.equals(this.uri) && localName.equals(this.localName)) ||
+                (parentMatchLevel > 0 &&
+                        ((uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_BAG)) ||
+                         (uri.equals(URI_RDF) && localName.equals(LOCAL_NAME_RDF_LI))
+                        )
+                );
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes attributes) {
+        if (isMatchingElement(uri, localName)) {
+            matchLevel++;
+        }
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel++;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) {
+        if (isMatchingParentElement(uri, localName)) {
+            parentMatchLevel--;
+        }
+        if (isMatchingElement(uri, localName)) {
+            matchLevel--;
+            if (matchLevel == 2) {
+                // we're inside a bag li element, add the bagged buffer
+                addMetadata(bufferBagged.toString().trim());
+                bufferBagged.setLength(0);
+                isBagless = false;
+            }
+            if (matchLevel == 0 && isBagless) {
+                String valueBagless = bufferBagless.toString();
+                if (valueBagless.length() > 0 && !valueBagless.contains(LOCAL_NAME_RDF_BAG)) {
+                    // we're in a standard element, add the bagless buffer
+                    addMetadata(valueBagless.trim());
+                    bufferBagless.setLength(0);
+                }
+                isBagless = true;
+            }
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) {
+        // We need to append to both buffers since we don't know if we're inside a bag until we're done
+        if (parentMatchLevel > 0 && matchLevel > 2) {
+            bufferBagged.append(ch, start, length);
+        }
+        if (parentMatchLevel > 0 && matchLevel > 0) {
+            bufferBagless.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) {
+        characters(ch, start, length);
+    }
+
+    @Override
+    protected void addMetadata(String value) {
+        LOG.trace("adding {}={}", name, value);
+        if (targetProperty != null && targetProperty.isMultiValuePermitted()) {
+            if ((value != null && value.length() > 0) || allowEmptyValues) {
+                if (value == null || value.length() == 0 && allowEmptyValues) {
+                    value = "";
+                }
+                String[] previous = metadata.getValues(name);
+                if (previous == null || !Arrays.asList(previous).contains(value) || allowDuplicateValues) {
+                    metadata.add(targetProperty, value);
+                }
+            }
+        } else {
+            super.addMetadata(value);
+        }
+    }
+}
diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java
b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java new file mode 100644 index 00000000..1f396901 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.commons.codec.binary.Base64; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + +public class FictionBookParser extends XMLParser { + private static final long serialVersionUID = 4195954546491524374L; + + private static final Set SUPPORTED_TYPES = + Collections.singleton(MediaType.application("x-fictionbook+xml")); + @Override + public Set getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + protected ContentHandler getContentHandler(ContentHandler handler, Metadata metadata, ParseContext context) { + return new BinaryElementsDataHandler( + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context), handler); + } + + private static class BinaryElementsDataHandler extends DefaultHandler { + private static final String ELEMENT_BINARY = "binary"; + + private boolean binaryMode = false; + private static final String ATTRIBUTE_ID = "id"; + + private final EmbeddedDocumentExtractor partExtractor; + private final ContentHandler handler; + private final StringBuilder binaryData = new StringBuilder(); + private Metadata metadata; + private static final String ATTRIBUTE_CONTENT_TYPE = "content-type"; + + private BinaryElementsDataHandler(EmbeddedDocumentExtractor partExtractor, ContentHandler handler) { + this.partExtractor = partExtractor; + this.handler = handler; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { + binaryMode = ELEMENT_BINARY.equals(localName); + if (binaryMode) { + binaryData.setLength(0); + metadata = new Metadata(); + + metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, attributes.getValue(ATTRIBUTE_ID)); + metadata.set(Metadata.CONTENT_TYPE, attributes.getValue(ATTRIBUTE_CONTENT_TYPE)); + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (binaryMode) { + try { + partExtractor.parseEmbedded( + new 
ByteArrayInputStream(Base64.decodeBase64(binaryData.toString())), + handler, + metadata, + true + ); + } catch (IOException e) { + throw new SAXException("IOException in parseEmbedded", e); + } + + binaryMode = false; + binaryData.setLength(0); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (!binaryMode) { + handler.characters(ch, start, length); + } else { + binaryData.append(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + handler.ignorableWhitespace(ch, start, length); + } + } +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java new file mode 100644 index 00000000..3fee00a3 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/MetadataHandler.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This adds Metadata entries with a specified name for + * the textual content of a node (if present), and + * all attribute values passed through the matcher + * (but not their names). 
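The FictionBookParser above hands every base64-decoded <binary> element to whatever EmbeddedDocumentExtractor is registered in the ParseContext. A hedged sketch of collecting the attachment names of an FB2 file this way (the ParseContext wiring is standard Tika; collectNames itself is made up for illustration):

```scala
import java.io.InputStream

import org.apache.tika.extractor.EmbeddedDocumentExtractor
import org.apache.tika.metadata.{Metadata, TikaMetadataKeys}
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.xml.FictionBookParser
import org.apache.tika.sax.BodyContentHandler
import org.xml.sax.ContentHandler

import scala.collection.mutable.ListBuffer

object Fb2Example {
  def collectNames(in: InputStream): List[String] = {
    val names = ListBuffer.empty[String]
    val ctx   = new ParseContext()
    ctx.set(classOf[EmbeddedDocumentExtractor], new EmbeddedDocumentExtractor {
      def shouldParseEmbedded(m: Metadata): Boolean = true
      // Called once per <binary> element, with the decoded bytes in `is`.
      def parseEmbedded(is: InputStream, h: ContentHandler, m: Metadata, outputHtml: Boolean): Unit =
        names += m.get(TikaMetadataKeys.RESOURCE_NAME_KEY)
    })
    new FictionBookParser().parse(in, new BodyContentHandler(), new Metadata(), ctx)
    names.toList
  }
}
```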
+ * + * @deprecated Use the {@link AttributeMetadataHandler} and + * {@link ElementMetadataHandler} classes instead + */ +public class MetadataHandler extends DefaultHandler { + + private final Metadata metadata; + + private final Property property; + private final String name; + + private final StringBuilder buffer = new StringBuilder(); + + public MetadataHandler(Metadata metadata, String name) { + this.metadata = metadata; + this.property = null; + this.name = name; + } + public MetadataHandler(Metadata metadata, Property property) { + this.metadata = metadata; + this.property = property; + this.name = property.getName(); + } + + public void addMetadata(String value) { + if (value.length() > 0) { + String previous = metadata.get(name); + if (previous != null && previous.length() > 0) { + value = previous + ", " + value; + } + + if (this.property != null) { + metadata.set(property, value); + } else { + metadata.set(name, value); + } + } + } + + public void endElement(String uri, String localName, String name) { + addMetadata(buffer.toString()); + buffer.setLength(0); + } + + public void startElement( + String uri, String localName, String name, Attributes attributes) { + for (int i = 0; i < attributes.getLength(); i++) { + addMetadata(attributes.getValue(i)); + } + } + + + public void characters(char[] ch, int start, int length) { + buffer.append(ch, start, length); + } + +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java new file mode 100644 index 00000000..e247a6c4 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/xml/XMLParser.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xml; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.TaggedContentHandler; +import org.apache.tika.sax.TextContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * XML parser. 
+ */
+public class XMLParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -6028836725280212837L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("xml"),
+                MediaType.image("svg+xml"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            metadata.set(Metadata.CONTENT_TYPE, "application/xml");
+        }
+
+        final XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+
+        TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
+        try {
+            XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(new EmbeddedContentHandler(
+                    getContentHandler(tagged, metadata, context))), context);
+        } catch (SAXException e) {
+            tagged.throwIfCauseOf(e);
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endElement("p");
+            xhtml.endDocument();
+        }
+    }
+
+    protected ContentHandler getContentHandler(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        return new TextContentHandler(handler, true);
+    }
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
new file mode 100644
index 00000000..b4951686
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractConfig.scala
@@ -0,0 +1,5 @@
+package docspell.extract
+
+import docspell.extract.ocr.OcrConfig
+
+case class ExtractConfig(ocr: OcrConfig, pdf: PdfConfig)
diff --git a/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
new file mode 100644
index 00000000..ee948c53
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/ExtractResult.scala
@@ -0,0 +1,39 @@
+package docspell.extract
+
+import docspell.common.MimeType
+
+import scala.util.Try
+
+sealed trait ExtractResult {
+
+  def textOption: Option[String]
+
+}
+
+object ExtractResult {
+
+  case class UnsupportedFormat(mime: MimeType) extends ExtractResult {
+    val textOption = None
+  }
+  def unsupportedFormat(mt: MimeType): ExtractResult =
+    UnsupportedFormat(mt)
+
+  case class Failure(ex: Throwable) extends ExtractResult {
+    val textOption = None
+  }
+  def failure(ex: Throwable): ExtractResult =
+    Failure(ex)
+
+  case class Success(text: String) extends ExtractResult {
+    val textOption = Some(text)
+  }
+  def success(text: String): ExtractResult =
+    Success(text)
+
+  def fromTry(r: Try[String]): ExtractResult =
+    r.fold(Failure.apply, Success.apply)
+
+  def fromEither(e: Either[Throwable, String]): ExtractResult =
+    e.fold(failure, success)
+
+}
diff --git a/modules/extract/src/main/scala/docspell/extract/Extraction.scala b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
new file mode 100644
index 00000000..02ca0502
--- /dev/null
+++ b/modules/extract/src/main/scala/docspell/extract/Extraction.scala
@@ -0,0 +1,88 @@
+package docspell.extract
+
+import cats.effect._
+import cats.implicits._
+import docspell.common._
+import docspell.extract.ocr.{OcrType, TextExtract}
+import docspell.extract.odf.{OdfExtract, OdfType}
+import
docspell.extract.poi.{PoiExtract, PoiType} +import docspell.extract.rtf.RtfExtract +import fs2.Stream +import docspell.files.TikaMimetype +import docspell.files.ImageSize + +trait Extraction[F[_]] { + + def extractText(data: Stream[F, Byte], dataType: DataType, lang: Language): F[ExtractResult] + +} + +object Extraction { + + def create[F[_]: Sync: ContextShift]( + blocker: Blocker, + logger: Logger[F], + cfg: ExtractConfig + ): Extraction[F] = + new Extraction[F] { + def extractText( + data: Stream[F, Byte], + dataType: DataType, + lang: Language + ): F[ExtractResult] = { + TikaMimetype.resolve(dataType, data).flatMap { + case MimeType.pdf => + PdfExtract + .get(data, blocker, lang, cfg.pdf.minTextLen, cfg.ocr, logger) + .map(ExtractResult.fromEither) + + case PoiType(mt) => + PoiExtract.get(data, mt).map(ExtractResult.fromEither) + + case RtfExtract.rtfType => + RtfExtract.get(data).map(ExtractResult.fromEither) + + case OdfType(_) => + OdfExtract.get(data).map(ExtractResult.fromEither) + + case OcrType(mt) => + val doExtract = TextExtract + .extractOCR(data, blocker, logger, lang.iso3, cfg.ocr) + .compile + .lastOrError + .attempt + .map(ExtractResult.fromEither) + + ImageSize.get(data).flatMap { + case Some(dim) => + if (dim.product > cfg.ocr.maxImageSize) { + logger.info(s"Image size (${dim.product}) is too large (max ${cfg.ocr.maxImageSize}).") *> + ExtractResult.failure(new Exception( + s"Image size (${dim.width}x${dim.height}) is too large (max ${cfg.ocr.maxImageSize}).") + ).pure[F] + } else { + doExtract + } + case None => + logger.info(s"Cannot read image data from ${mt.asString}. Extracting anyways.") *> + doExtract + } + + case OdfType.container => + logger.info(s"File detected as ${OdfType.container}. Try to read as OpenDocument file.") *> + OdfExtract.get(data).map(ExtractResult.fromEither) + + case mt@MimeType("text", sub) if !sub.contains("html") => + logger.info(s"File detected as ${mt.asString}. 
Returning itself as text.") *> + data.through(fs2.text.utf8Decode).compile.last.map { txt => + ExtractResult.success(txt.getOrElse("").trim) + } + + case mt => + ExtractResult.unsupportedFormat(mt).pure[F] + + } + } + } + +} diff --git a/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala b/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala new file mode 100644 index 00000000..7d4476f8 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/PdfConfig.scala @@ -0,0 +1,3 @@ +package docspell.extract + +case class PdfConfig (minTextLen: Int) diff --git a/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala new file mode 100644 index 00000000..51c1fbcb --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/PdfExtract.scala @@ -0,0 +1,51 @@ +package docspell.extract + +import cats.implicits._ +import cats.effect._ +import fs2.Stream +import docspell.common.{Language, Logger} +import docspell.extract.ocr.{OcrConfig, TextExtract} +import docspell.extract.pdfbox.PdfboxExtract + +object PdfExtract { + + def get[F[_]: Sync: ContextShift]( + in: Stream[F, Byte], + blocker: Blocker, + lang: Language, + stripMinLen: Int, + ocrCfg: OcrConfig, + logger: Logger[F] + ): F[Either[Throwable, String]] = { + + val runOcr = + TextExtract.extractOCR(in, blocker, logger, lang.iso3, ocrCfg).compile.lastOrError + + def chooseResult(ocrStr: String, strippedStr: String) = + if (ocrStr.length > strippedStr.length) + logger.info( + s"Using OCR text, as it is longer (${ocrStr.length} > ${strippedStr.length})" + ) *> ocrStr.pure[F] + else + logger.info( + s"Using stripped text (not OCR), as it is longer (${strippedStr.length} > ${ocrStr.length})" + ) *> strippedStr.pure[F] + + //maybe better: inspect the pdf and decide whether ocr or not + for { + pdfboxRes <- logger.debug("Trying to strip text from pdf using pdfbox.") *> PdfboxExtract.get[F](in) + res <- pdfboxRes.fold( + ex => + logger.info( + s"Stripping text from PDF resulted in an error: ${ex.getMessage}. Trying with OCR. " + ) >> runOcr.attempt, + str => + if (str.length >= stripMinLen) str.pure[F].attempt + else + logger + .info(s"Stripped text from PDF is small (${str.length}). Trying with OCR.") *> + runOcr.flatMap(ocrStr => chooseResult(ocrStr, str)).attempt + ) + } yield res + } +} diff --git a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala similarity index 55% rename from modules/text/src/main/scala/docspell/text/ocr/Ocr.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala index 99f558d3..ff30710c 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/Ocr.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/Ocr.scala @@ -1,28 +1,28 @@ -package docspell.text.ocr +package docspell.extract.ocr import java.nio.file.Path import cats.effect.{Blocker, ContextShift, Sync} import fs2.Stream -import org.log4s._ +import docspell.common._ object Ocr { - private[this] val logger = getLogger /** Extract the text of all pages in the given pdf file. 
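Taken together, Extraction, ExtractConfig and PdfExtract above form the new entry point that joex calls into: resolve the MIME type, dispatch to the matching extractor, fall back to OCR where needed, and wrap everything in the ExtractResult ADT. A rough wiring sketch (Language.English is assumed from docspell-common, which is not part of this diff):

```scala
import cats.effect._
import docspell.common._
import docspell.extract._
import docspell.extract.ocr.OcrConfig
import fs2.Stream

object ExtractExample {
  // Resolve the MIME type from a filename hint, extract, then fold the result ADT.
  def text(blocker: Blocker, data: Stream[IO, Byte])(implicit cs: ContextShift[IO]): IO[String] = {
    val logger = Logger.log4s[IO](org.log4s.getLogger)
    val cfg    = ExtractConfig(OcrConfig.default, PdfConfig(minTextLen = 500))
    Extraction
      .create[IO](blocker, logger, cfg)
      .extractText(data, DataType.Hint(MimeTypeHint.filename("letter.pdf")), Language.English)
      .flatMap {
        case ExtractResult.Success(txt) => IO.pure(txt)
        case ExtractResult.Failure(ex)  => IO.raiseError(ex)
        case ExtractResult.UnsupportedFormat(mt) =>
          IO.raiseError(new Exception(s"Unsupported format: ${mt.asString}"))
      }
  }
}
```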
*/ def extractPdf[F[_]: Sync: ContextShift]( pdf: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, - config: Config - ): Stream[F, String] = - File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => - runGhostscript(pdf, config, wd, blocker) - .flatMap({ tmpImg => - runTesseractFile(tmpImg, blocker, lang, config) - }) + config: OcrConfig + ): F[Option[String]] = + File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => + runGhostscript(pdf, config, wd, blocker, logger) + .flatMap(tmpImg => runTesseractFile(tmpImg, blocker, logger, lang, config)) .fold1(_ + "\n\n\n" + _) + .compile + .last } /** Extract the text from the given image file @@ -30,41 +30,45 @@ object Ocr { def extractImage[F[_]: Sync: ContextShift]( img: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = - runTesseractStdin(img, blocker, lang, config) + runTesseractStdin(img, blocker, logger, lang, config) def extractPdFFile[F[_]: Sync: ContextShift]( pdf: Path, blocker: Blocker, + logger: Logger[F], lang: String, - config: Config - ): Stream[F, String] = - File.withTempDir(config.ghostscript.workingDir, "extractpdf") { wd => - runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker) - .flatMap({ tif => - runTesseractFile(tif, blocker, lang, config) - }) + config: OcrConfig + ): F[Option[String]] = + File.withTempDir(config.ghostscript.workingDir, "extractpdf").use { wd => + runGhostscriptFile(pdf, config.ghostscript.command, wd, blocker, logger) + .flatMap(tif => runTesseractFile(tif, blocker, logger, lang, config)) .fold1(_ + "\n\n\n" + _) + .compile + .last } def extractImageFile[F[_]: Sync: ContextShift]( img: Path, blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = - runTesseractFile(img, blocker, lang, config) + runTesseractFile(img, blocker, logger, lang, config) /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. */ - private[text] def runGhostscript[F[_]: Sync: ContextShift]( + private[extract] def runGhostscript[F[_]: Sync: ContextShift]( pdf: Stream[F, Byte], - cfg: Config, + cfg: OcrConfig, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val xargs = if (cfg.pageRange.begin > 0) @@ -72,44 +76,37 @@ object Ocr { else cfg.ghostscript.command.args val cmd = cfg.ghostscript.command .copy(args = xargs) - .mapArgs( - replace( - Map( - "{{infile}}" -> "-", - "{{outfile}}" -> "%d.tif" - ) + .replace( + Map( + "{{infile}}" -> "-", + "{{outfile}}" -> "%d.tif" ) ) SystemCommand - .execSuccess(cmd, blocker, wd = Some(wd), stdin = pdf) - .evalMap({ _ => - File.listFiles(pathEndsWith(".tif"), wd) - }) + .execSuccess(cmd, blocker, logger, wd = Some(wd), stdin = pdf) + .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) .flatMap(fs => Stream.emits(fs)) } /** Run ghostscript to extract all pdf pages into tiff files. The * files are stored to a temporary location on disk and returned. 
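The removed private replace helper has moved behind SystemCommand.Config.replace in docspell-common, which is not shown in this diff; presumably it performs the same fold over every argument. A hypothetical stand-in to illustrate the {{placeholder}} convention used by the ghostscript, unpaper and tesseract commands:

```scala
// Hypothetical stand-in for docspell-common's SystemCommand.Config.
final case class Cmd(program: String, args: Seq[String]) {
  // Substitute every {{key}} token in every argument.
  def replace(repl: Map[String, String]): Cmd =
    copy(args = args.map(arg => repl.foldLeft(arg) { case (a, (k, v)) => a.replace(k, v) }))
}

// Cmd("gs", Seq("-sOutputFile={{outfile}}", "{{infile}}"))
//   .replace(Map("{{infile}}" -> "-", "{{outfile}}" -> "%d.tif"))
// ==> Cmd("gs", Seq("-sOutputFile=%d.tif", "-"))
```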
*/ - private[text] def runGhostscriptFile[F[_]: Sync: ContextShift]( + private[extract] def runGhostscriptFile[F[_]: Sync: ContextShift]( pdf: Path, - ghostscript: Config.Command, + ghostscript: SystemCommand.Config, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { - val cmd = ghostscript.mapArgs( - replace( - Map( - "{{infile}}" -> pdf.toAbsolutePath.toString, - "{{outfile}}" -> "%d.tif" - ) + val cmd = ghostscript.replace( + Map( + "{{infile}}" -> pdf.toAbsolutePath.toString, + "{{outfile}}" -> "%d.tif" ) ) SystemCommand - .execSuccess[F](cmd, blocker, wd = Some(wd)) - .evalMap({ _ => - File.listFiles(pathEndsWith(".tif"), wd) - }) + .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) + .evalMap(_ => File.listFiles(pathEndsWith(".tif"), wd)) .flatMap(fs => Stream.emits(fs)) } @@ -119,68 +116,63 @@ object Ocr { /** Run unpaper to optimize the image for ocr. The * files are stored to a temporary location on disk and returned. */ - private[text] def runUnpaperFile[F[_]: Sync: ContextShift]( + private[extract] def runUnpaperFile[F[_]: Sync: ContextShift]( img: Path, - unpaper: Config.Command, + unpaper: SystemCommand.Config, wd: Path, - blocker: Blocker + blocker: Blocker, + logger: Logger[F] ): Stream[F, Path] = { val targetFile = img.resolveSibling("u-" + img.getFileName.toString).toAbsolutePath - val cmd = unpaper.mapArgs( - replace( - Map( - "{{infile}}" -> img.toAbsolutePath.toString, - "{{outfile}}" -> targetFile.toString - ) + val cmd = unpaper.replace( + Map( + "{{infile}}" -> img.toAbsolutePath.toString, + "{{outfile}}" -> targetFile.toString ) ) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(wd)).map(_ => targetFile).handleErrorWith { - th => + SystemCommand + .execSuccess[F](cmd, blocker, logger, wd = Some(wd)) + .map(_ => targetFile) + .handleErrorWith { th => logger .warn(s"Unpaper command failed: ${th.getMessage}. Using input file for text extraction.") Stream.emit(img) - } + } } /** Run tesseract on the given image file and return the extracted * text. */ - private[text] def runTesseractFile[F[_]: Sync: ContextShift]( + private[extract] def runTesseractFile[F[_]: Sync: ContextShift]( img: Path, blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = // tesseract cannot cope with absolute filenames // so use the parent as working dir - runUnpaperFile(img, config.unpaper.command, img.getParent, blocker).flatMap { uimg => - val cmd = config.tesseract.command.mapArgs( - replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) - ) - SystemCommand.execSuccess[F](cmd, blocker, wd = Some(uimg.getParent)).map(_.stdout) + runUnpaperFile(img, config.unpaper.command, img.getParent, blocker, logger).flatMap { uimg => + val cmd = config.tesseract.command + .replace(Map("{{file}}" -> uimg.getFileName.toString, "{{lang}}" -> fixLanguage(lang))) + SystemCommand.execSuccess[F](cmd, blocker, logger, wd = Some(uimg.getParent)).map(_.stdout) } /** Run tesseract on the given image file and return the extracted * text. 
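One thing worth double-checking in runUnpaperFile above: with the new Logger[F], logger.warn returns an F[Unit], and inside handleErrorWith that value is discarded, so the warning is likely never executed. A sketch of a sequenced variant (warn stands in for Logger[F].warn):

```scala
import cats.effect.IO
import fs2.Stream

object FallbackExample {
  def warn(msg: String): IO[Unit] = IO(println(s"WARN: $msg"))

  // Run the log effect before emitting the fallback, instead of dropping it.
  def withFallback[A](s: Stream[IO, A], fallback: A): Stream[IO, A] =
    s.handleErrorWith { th =>
      Stream.eval(warn(s"command failed: ${th.getMessage}, using fallback")) >>
        Stream.emit(fallback)
    }
}
```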
*/ - private[text] def runTesseractStdin[F[_]: Sync: ContextShift]( + private[extract] def runTesseractStdin[F[_]: Sync: ContextShift]( img: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = { val cmd = config.tesseract.command - .mapArgs(replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang)))) - SystemCommand.execSuccess(cmd, blocker, stdin = img).map(_.stdout) + .replace(Map("{{file}}" -> "stdin", "{{lang}}" -> fixLanguage(lang))) + SystemCommand.execSuccess(cmd, blocker, logger, stdin = img).map(_.stdout) } - private def replace(repl: Map[String, String]): String => String = - s => - repl.foldLeft(s) { - case (res, (k, v)) => - res.replace(k, v) - } - private def fixLanguage(lang: String): String = lang match { case "de" => "deu" diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala new file mode 100644 index 00000000..739b0149 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrConfig.scala @@ -0,0 +1,52 @@ +package docspell.extract.ocr + +import java.nio.file.{Path, Paths} + +import docspell.common._ + +case class OcrConfig( + maxImageSize: Int, + ghostscript: OcrConfig.Ghostscript, + pageRange: OcrConfig.PageRange, + unpaper: OcrConfig.Unpaper, + tesseract: OcrConfig.Tesseract +) { +} + +object OcrConfig { + + case class PageRange(begin: Int) + + case class Ghostscript(command: SystemCommand.Config, workingDir: Path) + + case class Tesseract(command: SystemCommand.Config) + + case class Unpaper(command: SystemCommand.Config) + + val default = OcrConfig( + maxImageSize = 3000 * 3000, + pageRange = PageRange(10), + ghostscript = Ghostscript( + SystemCommand.Config( + "gs", + Seq( + "-dNOPAUSE", + "-dBATCH", + "-dSAFER", + "-sDEVICE=tiffscaled8", + "-sOutputFile={{outfile}}", + "{{infile}}" + ), + Duration.seconds(30) + ), + Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") + ), + unpaper = Unpaper( + SystemCommand.Config("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30)) + ), + tesseract = Tesseract( + SystemCommand + .Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) + ) + ) +} diff --git a/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala new file mode 100644 index 00000000..f2effac6 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/ocr/OcrType.scala @@ -0,0 +1,16 @@ +package docspell.extract.ocr + +import docspell.common.MimeType + +object OcrType { + + val jpeg = MimeType.jpeg + val png = MimeType.png + val tiff = MimeType.tiff + val pdf = MimeType.pdf + + val all = Set(jpeg, png, tiff, pdf) + + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(all.contains) +} diff --git a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala similarity index 60% rename from modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala rename to modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala index 884a1581..7246bb7c 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TextExtract.scala +++ b/modules/extract/src/main/scala/docspell/extract/ocr/TextExtract.scala @@ -1,7 +1,8 @@ -package docspell.text.ocr +package docspell.extract.ocr import cats.effect.{Blocker, ContextShift, Sync} 
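OcrConfig.default above hard-codes the external commands with {{...}} placeholders; since everything is plain case classes, callers can tweak it with copy. For example, lowering the image-bomb threshold and giving tesseract more time:

```scala
import docspell.common._
import docspell.extract.ocr.OcrConfig

object OcrConfigExample {
  val cfg: OcrConfig = OcrConfig.default.copy(
    maxImageSize = 2000 * 2000, // reject anything above ~4 megapixels before OCR
    tesseract = OcrConfig.Tesseract(
      SystemCommand.Config("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(2))
    )
  )
}
```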
-import docspell.common.MimeType +import docspell.common._ +import docspell.files._ import fs2.Stream object TextExtract { @@ -9,28 +10,27 @@ object TextExtract { def extract[F[_]: Sync: ContextShift]( in: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = - extractOCR(in, blocker, lang, config) + extractOCR(in, blocker, logger, lang, config) def extractOCR[F[_]: Sync: ContextShift]( in: Stream[F, Byte], blocker: Blocker, + logger: Logger[F], lang: String, - config: Config + config: OcrConfig ): Stream[F, String] = Stream - .eval(TikaMimetype.detect(in)) + .eval(TikaMimetype.detect(in, MimeTypeHint.none)) .flatMap({ - case mt if !config.isAllowed(mt) => - raiseError(s"File `$mt` not allowed") - case MimeType.pdf => - Ocr.extractPdf(in, blocker, lang, config) + Stream.eval(Ocr.extractPdf(in, blocker, logger, lang, config)).unNoneTerminate case mt if mt.primary == "image" => - Ocr.extractImage(in, blocker, lang, config) + Ocr.extractImage(in, blocker, logger, lang, config) case mt => raiseError(s"File `$mt` not supported") diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala new file mode 100644 index 00000000..ae3ac66d --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfExtract.scala @@ -0,0 +1,30 @@ +package docspell.extract.odf + +import cats.effect._ +import cats.implicits._ +import fs2.Stream +import java.io.{ByteArrayInputStream, InputStream} + +import org.apache.tika.metadata.Metadata +import org.apache.tika.parser.ParseContext +import org.apache.tika.parser.odf.OpenDocumentParser +import org.apache.tika.sax.BodyContentHandler + +import scala.util.Try + +object OdfExtract { + + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) + + + def get(is: InputStream) = Try { + val handler = new BodyContentHandler() + val pctx = new ParseContext() + val meta = new Metadata() + val ooparser = new OpenDocumentParser() + ooparser.parse(is, handler, meta, pctx) + handler.toString.trim + }.toEither + +} diff --git a/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala new file mode 100644 index 00000000..3e935ef4 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/odf/OdfType.scala @@ -0,0 +1,18 @@ +package docspell.extract.odf + +import docspell.common.MimeType + +object OdfType { + + val odt = MimeType.application("vnd.oasis.opendocument.text") + val ods = MimeType.application("vnd.oasis.opendocument.spreadsheet") + val odtAlias = MimeType.application("x-vnd.oasis.opendocument.text") + val odsAlias = MimeType.application("x-vnd.oasis.opendocument.spreadsheet") + + val container = MimeType.zip + + val all = Set(odt, ods, odtAlias, odsAlias) + + def unapply(mt: MimeType): Option[MimeType] = + Some(mt).filter(all.contains) +} diff --git a/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala new file mode 100644 index 00000000..c935100c --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/pdfbox/PdfboxExtract.scala @@ -0,0 +1,34 @@ +package docspell.extract.pdfbox + +import java.io.InputStream +import java.nio.file.Path + +import cats.implicits._ +import cats.effect.Sync +import 
org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.text.PDFTextStripper + +import scala.util.{Try, Using} +import fs2.Stream + +object PdfboxExtract { + + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map { bytes => + Using(PDDocument.load(bytes))(readText).toEither.flatten + } + + def get(is: InputStream): Either[Throwable, String] = + Using(PDDocument.load(is))(readText).toEither.flatten + + def get(inFile: Path): Either[Throwable, String] = + Using(PDDocument.load(inFile.toFile))(readText).toEither.flatten + + private def readText(doc: PDDocument): Either[Throwable, String] = + Try { + val stripper = new PDFTextStripper() + stripper.setAddMoreFormatting(true) + stripper.setLineSeparator("\n") + stripper.getText(doc).trim // trim here already + }.toEither +} diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala new file mode 100644 index 00000000..48cd0638 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiExtract.scala @@ -0,0 +1,88 @@ +package docspell.extract.poi + +import java.io.{ByteArrayInputStream, InputStream} + +import cats.data.EitherT +import cats.implicits._ +import cats.effect.Sync +import org.apache.poi.hssf.extractor.ExcelExtractor +import org.apache.poi.hssf.usermodel.HSSFWorkbook +import org.apache.poi.hwpf.extractor.WordExtractor +import org.apache.poi.xssf.extractor.XSSFExcelExtractor +import org.apache.poi.xssf.usermodel.XSSFWorkbook +import org.apache.poi.xwpf.extractor.XWPFWordExtractor +import org.apache.poi.xwpf.usermodel.XWPFDocument +import fs2.Stream + +import scala.util.Try +import docspell.common._ +import docspell.files.TikaMimetype + +object PoiExtract { + + def get[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[Either[Throwable, String]] = + TikaMimetype.detect(data, hint).flatMap(mt => get(data, mt)) + + def get[F[_]: Sync](data: Stream[F, Byte], mime: MimeType): F[Either[Throwable, String]] = + mime match { + case PoiType.doc => + getDoc(data) + case PoiType.xls => + getXls(data) + case PoiType.xlsx => + getXlsx(data) + case PoiType.docx => + getDocx(data) + case PoiType.msoffice => + EitherT(getDoc[F](data)) + .recoverWith({ + case _ => EitherT(getXls[F](data)) + }) + .value + case PoiType.ooxml => + EitherT(getDocx[F](data)) + .recoverWith({ + case _ => EitherT(getXlsx[F](data)) + }) + .value + case mt => + Sync[F].pure(Left(new Exception(s"Unsupported content: ${mt.asString}"))) + } + + def getDocx(is: InputStream): Either[Throwable, String] = + Try { + val xt = new XWPFWordExtractor(new XWPFDocument(is)) + xt.getText.trim + }.toEither + + def getDoc(is: InputStream): Either[Throwable, String] = + Try { + val xt = new WordExtractor(is) + xt.getText.trim + }.toEither + + def getXlsx(is: InputStream): Either[Throwable, String] = + Try { + val xt = new XSSFExcelExtractor(new XSSFWorkbook(is)) + xt.getText.trim + }.toEither + + def getXls(is: InputStream): Either[Throwable, String] = + Try { + val xt = new ExcelExtractor(new HSSFWorkbook(is)) + xt.getText.trim + }.toEither + + def getDocx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDocx) + + def getDoc[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getDoc) + + def getXlsx[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + 
data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXlsx) + + def getXls[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map(new ByteArrayInputStream(_)).map(getXls) + +} diff --git a/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala new file mode 100644 index 00000000..f77cccb5 --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/poi/PoiType.scala @@ -0,0 +1,19 @@ +package docspell.extract.poi + +import docspell.common.MimeType + +object PoiType { + + val msoffice = MimeType.application("x-tika-msoffice") + val ooxml = MimeType.application("x-tika-ooxml") + val docx = MimeType.application("vnd.openxmlformats-officedocument.wordprocessingml.document") + val xlsx = MimeType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet") + val xls = MimeType.application("vnd.ms-excel") + val doc = MimeType.application("msword") + + val all = Set(msoffice, ooxml, docx, xlsx, xls, doc) + + def unapply(arg: MimeType): Option[MimeType] = + Some(arg).filter(all.contains) + +} diff --git a/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala new file mode 100644 index 00000000..c4a37fec --- /dev/null +++ b/modules/extract/src/main/scala/docspell/extract/rtf/RtfExtract.scala @@ -0,0 +1,27 @@ +package docspell.extract.rtf + +import java.io.{ByteArrayInputStream, InputStream} + +import cats.implicits._ +import cats.effect.Sync +import docspell.common.MimeType +import fs2.Stream +import javax.swing.text.rtf.RTFEditorKit + +import scala.util.Try + +object RtfExtract { + + val rtfType = MimeType.application("rtf") + + def get(is: InputStream): Either[Throwable, String] = + Try { + val kit = new RTFEditorKit() + val doc = kit.createDefaultDocument() + kit.read(is, doc, 0) + doc.getText(0, doc.getLength).trim + }.toEither + + def get[F[_]: Sync](data: Stream[F, Byte]): F[Either[Throwable, String]] = + data.compile.to(Array).map(new ByteArrayInputStream(_)).map(get) +} diff --git a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala similarity index 62% rename from modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala rename to modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala index f9e94ec2..ac2998a8 100644 --- a/modules/text/src/test/scala/docspell/text/ocr/TextExtractionSuite.scala +++ b/modules/extract/src/test/scala/docspell/extract/ocr/TextExtractionSuite.scala @@ -1,16 +1,19 @@ -package docspell.text.ocr +package docspell.extract.ocr import cats.effect.IO -import docspell.text.TestFiles +import docspell.common.Logger +import docspell.files.TestFiles import minitest.SimpleTestSuite object TextExtractionSuite extends SimpleTestSuite { import TestFiles._ + val logger = Logger.log4s[IO](org.log4s.getLogger) + test("extract english pdf") { ignore() val text = TextExtract - .extract[IO](letterSourceEN, blocker, "eng", Config.default) + .extract[IO](letterSourceEN, blocker, logger, "eng", OcrConfig.default) .compile .lastOrError .unsafeRunSync() @@ -21,7 +24,7 @@ object TextExtractionSuite extends SimpleTestSuite { ignore() val expect = TestFiles.letterDEText val extract = TextExtract - .extract[IO](letterSourceDE, blocker, "deu", Config.default) + .extract[IO](letterSourceDE, blocker, logger, "deu", OcrConfig.default) 
.compile .lastOrError .unsafeRunSync() diff --git a/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala new file mode 100644 index 00000000..00189e10 --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/odf/OdfExtractTest.scala @@ -0,0 +1,28 @@ +package docspell.extract.odf + +import cats.effect._ +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object OdfExtractTest extends SimpleTestSuite { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val files = List( + ExampleFiles.examples_sample_odt -> 6372, + ExampleFiles.examples_sample_ods -> 717 + ) + + test("test extract from odt") { + files.foreach { case (file, len) => + val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity) + val str1 = OdfExtract.get(is).fold(throw _, identity) + assertEquals(str1.length, len) + + val data = file.readURL[IO](8192, blocker) + val str2 = OdfExtract.get[IO](data).unsafeRunSync().fold(throw _, identity) + assertEquals(str2, str1) + } + } + +} diff --git a/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala new file mode 100644 index 00000000..4d06be76 --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/pdfbox/PdfboxExtractTest.scala @@ -0,0 +1,48 @@ +package docspell.extract.pdfbox + +import cats.effect._ +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object PdfboxExtractTest extends SimpleTestSuite { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val textPDFs = List( + ExampleFiles.letter_de_pdf -> TestFiles.letterDEText, + ExampleFiles.letter_en_pdf -> TestFiles.letterENText + ) + + test("extract text from text PDFs by inputstream") { + textPDFs.foreach { + case (file, txt) => + val url = file.toJavaUrl.fold(sys.error, identity) + val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + val received = removeFormatting(str) + val expect = removeFormatting(txt) + assertEquals(received, expect) + } + } + + test("extract text from text PDFs via Stream") { + textPDFs.foreach { + case (file, txt) => + val data = file.readURL[IO](8192, blocker) + val str = PdfboxExtract.get(data).unsafeRunSync().fold(throw _, identity) + val received = removeFormatting(str) + val expect = removeFormatting(txt) + assertEquals(received, expect) + } + } + + test("extract text from image PDFs") { + val url = ExampleFiles.scanner_pdf13_pdf.toJavaUrl.fold(sys.error, identity) + + val str = PdfboxExtract.get(url.openStream()).fold(throw _, identity) + + assertEquals(str, "") + } + + private def removeFormatting(str: String): String = + str.replaceAll("[\\s;:.,\\-]+", "").toLowerCase +} diff --git a/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala new file mode 100644 index 00000000..002755bc --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/poi/PoiExtractTest.scala @@ -0,0 +1,39 @@ +package docspell.extract.poi + +import cats.effect._ +import docspell.common.MimeTypeHint +import docspell.files.{ExampleFiles, TestFiles} +import minitest.SimpleTestSuite + +object PoiExtractTest extends SimpleTestSuite { + val blocker = TestFiles.blocker + implicit val CS = TestFiles.CS + + val officeFiles = List( + ExampleFiles.examples_sample_doc -> 6241, + 
ExampleFiles.examples_sample_docx -> 6179, + ExampleFiles.examples_sample_xlsx -> 660, + ExampleFiles.examples_sample_xls -> 660 + ) + + test("extract text from ms office files") { + officeFiles.foreach { + case (file, len) => + val str1 = PoiExtract + .get[IO](file.readURL[IO](8192, blocker), MimeTypeHint.none) + .unsafeRunSync() + .fold(throw _, identity) + + val str2 = PoiExtract + .get[IO]( + file.readURL[IO](8192, blocker), + MimeTypeHint(Some(file.path.segments.last), None) + ) + .unsafeRunSync() + .fold(throw _, identity) + + assertEquals(str1, str2) + assertEquals(str1.length, len) + } + } +} diff --git a/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala b/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala new file mode 100644 index 00000000..699af486 --- /dev/null +++ b/modules/extract/src/test/scala/docspell/extract/rtf/RtfExtractTest.scala @@ -0,0 +1,14 @@ +package docspell.extract.rtf + +import docspell.files.ExampleFiles +import minitest.SimpleTestSuite + +object RtfExtractTest extends SimpleTestSuite { + + test("extract text from rtf using java input-stream") { + val file = ExampleFiles.examples_sample_rtf + val is = file.toJavaUrl.map(_.openStream()).fold(sys.error, identity) + val str = RtfExtract.get(is).fold(throw _, identity) + assertEquals(str.length, 7342) + } +} diff --git a/modules/files/src/main/scala/docspell/files/Dimension.scala b/modules/files/src/main/scala/docspell/files/Dimension.scala new file mode 100644 index 00000000..44025311 --- /dev/null +++ b/modules/files/src/main/scala/docspell/files/Dimension.scala @@ -0,0 +1,9 @@ +package docspell.files + +case class Dimension(width: Int, height: Int) { + + def product = width * height + + def toAwtDimension: java.awt.Dimension = + new java.awt.Dimension(width, height) +} diff --git a/modules/files/src/main/scala/docspell/files/ImageSize.scala b/modules/files/src/main/scala/docspell/files/ImageSize.scala new file mode 100644 index 00000000..21cd0180 --- /dev/null +++ b/modules/files/src/main/scala/docspell/files/ImageSize.scala @@ -0,0 +1,61 @@ +package docspell.files + +import java.io.{ByteArrayInputStream, InputStream} +import java.nio.file.Path + +import cats.implicits._ +import cats.effect._ +import fs2.Stream +import javax.imageio.stream.{FileImageInputStream, ImageInputStream} +import javax.imageio.{ImageIO, ImageReader} + +import scala.jdk.CollectionConverters._ +import scala.util.{Try, Using} + +object ImageSize { + + /** Return the image size from its header without reading + * the whole image into memory. + */ + def get(file: Path): Option[Dimension] = + Using(new FileImageInputStream(file.toFile))(getDimension).toOption.flatten + + /** Return the image size from its header without reading + * the whole image into memory. + */ + def get(in: InputStream): Option[Dimension] = + Option(ImageIO.createImageInputStream(in)).flatMap(getDimension) + + /** Return the image size from its header without reading + * the whole image into memory. 
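This header-only probing is what lets Extraction reject the decompression bombs checked in under test resources before handing anything to tesseract. A small sketch for files on disk (Dimension.product is the pixel count):

```scala
import java.nio.file.Path

import docspell.files.ImageSize

object ImageGuard {
  // Accept only images whose header reports at most `maxImageSize` pixels,
  // without ever decoding the image data.
  def acceptable(file: Path, maxImageSize: Int): Boolean =
    ImageSize.get(file).exists(_.product <= maxImageSize)
}
```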
+ */ + def get[F[_]: Sync](data: Stream[F, Byte]): F[Option[Dimension]] = { + data.take(768).compile.to(Array).map(ar => { + val iis = ImageIO.createImageInputStream(new ByteArrayInputStream(ar)) + if (iis == null) sys.error("no reader given for the array") + else getDimension(iis) + }) + } + + private def getDimension(in: ImageInputStream): Option[Dimension] = + ImageIO + .getImageReaders(in) + .asScala + .to(LazyList) + .collectFirst(Function.unlift { reader => + val dim = getDimension(in, reader).toOption + reader.dispose() + dim + }) + + private def getDimension( + in: ImageInputStream, + reader: ImageReader + ): Either[Throwable, Dimension] = + Try { + reader.setInput(in) + val width = reader.getWidth(reader.getMinIndex) + val height = reader.getHeight(reader.getMinIndex) + Dimension(width, height) + }.toEither +} diff --git a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala similarity index 56% rename from modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala rename to modules/files/src/main/scala/docspell/files/TikaMimetype.scala index 5c90c728..88b95874 100644 --- a/modules/text/src/main/scala/docspell/text/ocr/TikaMimetype.scala +++ b/modules/files/src/main/scala/docspell/files/TikaMimetype.scala @@ -1,13 +1,18 @@ -package docspell.text.ocr +package docspell.files + +import java.io.BufferedInputStream +import java.nio.file.{Files, Path} import cats.implicits._ import cats.effect.Sync -import docspell.common.MimeType +import docspell.common._ import fs2.Stream import org.apache.tika.config.TikaConfig import org.apache.tika.metadata.{HttpHeaders, Metadata, TikaMetadataKeys} import org.apache.tika.mime.MediaType +import scala.util.Using + object TikaMimetype { private val tika = new TikaConfig().getDetector @@ -35,7 +40,20 @@ object TikaMimetype { private def fromBytes(bv: Array[Byte], hint: MimeTypeHint): MimeType = convert(tika.detect(new java.io.ByteArrayInputStream(bv), makeMetadata(hint))) - def detect[F[_]: Sync](data: Stream[F, Byte]): F[MimeType] = - data.take(1024).compile.toVector.map(bytes => fromBytes(bytes.toArray, MimeTypeHint.none)) + def detect[F[_]: Sync](data: Stream[F, Byte], hint: MimeTypeHint): F[MimeType] = + data.take(64).compile.toVector.map(bytes => fromBytes(bytes.toArray, hint)) + def resolve[F[_]: Sync](dt: DataType, data: Stream[F, Byte]): F[MimeType] = + dt match { + case DataType.Exact(mt) => mt.pure[F] + case DataType.Hint(hint) => TikaMimetype.detect(data, hint) + } + + def detect[F[_]: Sync](file: Path): F[MimeType] = + Sync[F].delay { + val hint = MimeTypeHint.filename(file.getFileName.toString) + Using(new BufferedInputStream(Files.newInputStream(file), 64))({ in => + convert(tika.detect(in, makeMetadata(hint))) + }).toEither + }.rethrow } diff --git a/modules/files/src/test/resources/bombs/20K-gray.jpeg b/modules/files/src/test/resources/bombs/20K-gray.jpeg new file mode 100644 index 00000000..4804bb10 Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-gray.jpeg differ diff --git a/modules/files/src/test/resources/bombs/20K-gray.png b/modules/files/src/test/resources/bombs/20K-gray.png new file mode 100644 index 00000000..66d8b0a4 Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-gray.png differ diff --git a/modules/files/src/test/resources/bombs/20K-rgb.jpeg b/modules/files/src/test/resources/bombs/20K-rgb.jpeg new file mode 100644 index 00000000..a4ef7bf6 Binary files /dev/null and 
b/modules/files/src/test/resources/bombs/20K-rgb.jpeg differ diff --git a/modules/files/src/test/resources/bombs/20K-rgb.png b/modules/files/src/test/resources/bombs/20K-rgb.png new file mode 100644 index 00000000..cf332e53 Binary files /dev/null and b/modules/files/src/test/resources/bombs/20K-rgb.png differ diff --git a/modules/files/src/test/resources/camera/letter-en.jpg b/modules/files/src/test/resources/camera/letter-en.jpg new file mode 100644 index 00000000..f5da55e7 Binary files /dev/null and b/modules/files/src/test/resources/camera/letter-en.jpg differ diff --git a/modules/files/src/test/resources/camera/letter-en.png b/modules/files/src/test/resources/camera/letter-en.png new file mode 100644 index 00000000..d04e8d13 Binary files /dev/null and b/modules/files/src/test/resources/camera/letter-en.png differ diff --git a/modules/files/src/test/resources/camera/letter-en.tiff b/modules/files/src/test/resources/camera/letter-en.tiff new file mode 100644 index 00000000..62c936a0 Binary files /dev/null and b/modules/files/src/test/resources/camera/letter-en.tiff differ diff --git a/modules/files/src/test/resources/examples/sample.doc b/modules/files/src/test/resources/examples/sample.doc new file mode 100644 index 00000000..9423c5a4 Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.doc differ diff --git a/modules/files/src/test/resources/examples/sample.docx b/modules/files/src/test/resources/examples/sample.docx new file mode 100644 index 00000000..365b8e19 Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.docx differ diff --git a/modules/files/src/test/resources/examples/sample.ods b/modules/files/src/test/resources/examples/sample.ods new file mode 100644 index 00000000..a70063e2 Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.ods differ diff --git a/modules/files/src/test/resources/examples/sample.odt b/modules/files/src/test/resources/examples/sample.odt new file mode 100644 index 00000000..51a59ff7 Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.odt differ diff --git a/modules/files/src/test/resources/examples/sample.rtf b/modules/files/src/test/resources/examples/sample.rtf new file mode 100644 index 00000000..6e08e5fe --- /dev/null +++ b/modules/files/src/test/resources/examples/sample.rtf @@ -0,0 +1,697 @@ +{\rtf1\ansi\deff3\adeflang1025 +{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\froman\fprq2\fcharset0 Symbol;}{\f6\froman\fprq2\fcharset0 OpenSymbol{\*\falt Arial Unicode MS};}{\f7\froman\fprq2\fcharset0 DejaVu Sans;}{\f8\froman\fprq2\fcharset0 Open Sans{\*\falt Arial};}{\f9\fnil\fprq2\fcharset0 Droid Sans Fallback;}{\f10\fnil\fprq2\fcharset0 OpenSymbol{\*\falt Arial Unicode MS};}{\f11\fnil\fprq2\fcharset0 DejaVu Sans;}{\f12\fnil\fprq2\fcharset0 Open Sans{\*\falt Arial};}{\f13\fnil\fprq2\fcharset0 FreeSans;}{\f14\fnil\fprq2\fcharset0 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;\red0\green0\blue10;\red0\green0\blue1;} 
+{\stylesheet{\s0\snext0\ql\nowidctlpar\hyphpar0\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\kerning0\loch\f3\fs24\lang1033 Normal;} +{\s1\sbasedon50\snext1\ql\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs36\alang1081\ab\loch\f4\fs36\lang1033 Heading 1;} +{\s2\sbasedon50\snext2\ql\nowidctlpar\hyphpar0\sb200\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs32\alang1081\ab\loch\f4\fs32\lang1033 Heading 2;} +{\s3\sbasedon50\snext3\ql\nowidctlpar\hyphpar0\sb140\sa120\keepn\ltrpar\cf15\b\dbch\af9\langfe2052\dbch\af13\afs28\alang1081\ab\loch\f4\fs28\lang1033 Heading 3;} +{\*\cs15\snext15 WW8Num1z0;} +{\*\cs16\snext16 WW8Num1z1;} +{\*\cs17\snext17 WW8Num1z2;} +{\*\cs18\snext18 WW8Num1z3;} +{\*\cs19\snext19 WW8Num1z4;} +{\*\cs20\snext20 WW8Num1z5;} +{\*\cs21\snext21 WW8Num1z6;} +{\*\cs22\snext22 WW8Num1z7;} +{\*\cs23\snext23 WW8Num1z8;} +{\*\cs24\snext24 WW8Num2z0;} +{\*\cs25\snext25 WW8Num2z1;} +{\*\cs26\snext26 WW8Num2z2;} +{\*\cs27\snext27 WW8Num2z3;} +{\*\cs28\snext28 WW8Num2z4;} +{\*\cs29\snext29 WW8Num2z5;} +{\*\cs30\snext30 WW8Num2z6;} +{\*\cs31\snext31 WW8Num2z7;} +{\*\cs32\snext32 WW8Num2z8;} +{\*\cs33\snext33\dbch\af10\loch\f5 WW8Num3z0;} +{\*\cs34\snext34\dbch\af10\loch\f6 WW8Num3z1;} +{\*\cs35\snext35\dbch\af10\dbch\af10\loch\f6 Bullets;} +{\*\cs36\snext36\cf9\ul\ulc0\langfe255\alang255\lang255 Internet Link;} +{\*\cs37\snext37\cf13\ul\ulc0\langfe255\alang255\lang255 Visited Internet Link;} +{\*\cs38\snext38\dbch\af14 ListLabel 1;} +{\*\cs39\snext39\dbch\af10 ListLabel 2;} +{\*\cs40\snext40\b0\dbch\af14\loch\f7\fs21 ListLabel 3;} +{\*\cs41\snext41\dbch\af10 ListLabel 4;} +{\*\cs42\snext42\dbch\af10 ListLabel 5;} +{\*\cs43\snext43\dbch\af14 ListLabel 6;} +{\*\cs44\snext44\dbch\af10 ListLabel 7;} +{\*\cs45\snext45\dbch\af10 ListLabel 8;} +{\*\cs46\snext46\dbch\af14 ListLabel 9;} +{\*\cs47\snext47\dbch\af10 ListLabel 10;} +{\*\cs48\snext48\dbch\af10 ListLabel 11;} +{\*\cs49\snext49\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\loch\f7\fs21 ListLabel 12;} +{\s50\sbasedon0\snext51\ql\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs28\alang1081\loch\f4\fs28\lang1033 Heading;} +{\s51\sbasedon0\snext51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033 Text Body;} +{\s52\sbasedon51\snext52\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033 List;} +{\s53\sbasedon0\snext53\ql\nowidctlpar\hyphpar0\sb120\sa120\noline\ltrpar\cf17\i\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\ai\loch\f3\fs24\lang1033 Caption;} +{\s54\sbasedon0\snext54\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033 Index;} +{\s55\sbasedon0\snext55\ql\nowidctlpar\hyphpar0\li567\ri567\lin567\rin567\fi0\sb0\sa283\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033 Quotations;} +{\s56\sbasedon50\snext56\qc\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs56\alang1081\ab\loch\f4\fs56\lang1033 Title;} +{\s57\sbasedon50\snext57\qc\nowidctlpar\hyphpar0\sb60\sa120\keepn\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs36\alang1081\loch\f4\fs36\lang1033 Subtitle;} +{\s58\sbasedon0\snext58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033 Table Contents;} 
+{\s59\sbasedon58\snext59\qc\nowidctlpar\hyphpar0\noline\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\ab\loch\f3\fs24\lang1033 Table Heading;} +}{\*\listtable{\list\listtemplateid1 +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-432\li792} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-576\li936} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-720\li1080} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-864\li1224} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-1008\li1368} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-1152\li1512} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-1296\li1656} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-1440\li1800} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi-1584\li1944}\listid1} +{\list\listtemplateid2 +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u61623 ?;}{\levelnumbers;}\f15\b0\dbch\af14\fi-360\li720} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9702 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li1080} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9642 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li1440} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u61623 ?;}{\levelnumbers;}\f15\dbch\af14\fi-360\li1800} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9702 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li2160} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9642 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li2520} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u61623 ?;}{\levelnumbers;}\f15\dbch\af14\fi-360\li2880} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9702 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li3240} +{\listlevel\levelnfc23\leveljc0\levelstartat1\levelfollow0{\leveltext \'01\u9642 ?;}{\levelnumbers;}\f16\dbch\af10\fi-360\li3600}\listid2} +{\list\listtemplateid3 +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0} +{\listlevel\levelnfc255\leveljc0\levelstartat1\levelfollow2{\leveltext \'00;}{\levelnumbers;}\fi0\li0}\listid3} 
+}{\listoverridetable{\listoverride\listid1\listoverridecount0\ls1}{\listoverride\listid2\listoverridecount0\ls2}{\listoverride\listid3\listoverridecount0\ls3}}{\*\generator LibreOffice/6.0.7.3$Linux_X86_64 LibreOffice_project/00m0$Build-3}{\info{\creatim\yr2017\mo8\dy2\hr11\min9}{\revtim\yr2019\mo9\dy21\hr14\min2}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709 +\hyphauto0\viewscale100 +{\*\pgdsctbl +{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Default Style;}} +\formshade{\*\pgdscno0}\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\htmautsp +{\*\ftnsep\chftnsep}\viewbksp1{\*\background{\shp{\*\shpinst{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fillColor}{\sv 16777215}}}}}\pgndec\pard\plain \s56\qc\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs56\alang1081\ab\loch\f4\fs56\lang1033\qc\sb240\sa120{\cbpat8\cbpat8\fs21\rtlch \ltrch\loch +Lorem ipsum } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s1\ql\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs36\alang1081\ab\loch\f4\fs36\lang1033{\listtext\pard\plain }\ilvl0\ls1 \li792\ri0\lin792\rin0\fi-432\li0\ri0\lin0\rin0\fi-432\sb240\sa120\keepn{\rtlch \ltrch\loch +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. }{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b\dbch\af11\ab\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. }{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. }{\scaps0\caps0\cf1\expnd0\expndtw0\i\b0\dbch\af11\ai\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. }{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. 
Nulla iaculis tellus sit amet mauris tempus fringilla.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Maecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b\dbch\af11\ab\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Maecenas non lorem quis tellus placerat varius. } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i\b0\dbch\af11\ai\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Nulla facilisi. } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\ul\ulc0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Aenean congue fringilla justo ut aliquam. } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{{\field{\*\fldinst HYPERLINK "https://products.office.com/en-us/word" }{\fldrslt {\cs36\cf9\ul\ulc0\langfe255\alang255\lang255\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Mauris id ex erat. }{}}}\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. } +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Morbi viverra semper lorem nec molestie. 
} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033{\listtext\pard\plain \b0\dbch\af14\loch\f7\fs21 \u61623\'3f\tab}\ilvl0\ls2 \li720\ri0\lin720\rin0\fi-360\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +In eleifend velit vitae libero sollicitudin euismod. Fusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. 
} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s1\ql\nowidctlpar\hyphpar0\sb240\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs36\alang1081\ab\loch\f4\fs36\lang1033{\listtext\pard\plain }\ilvl0\ls1 \li792\ri0\lin792\rin0\fi-432\li0\ri0\lin0\rin0\fi-432\sb240\sa120\keepn{\rtlch \ltrch\loch +Cras fringilla ipsum magna, in fringilla dui commodo a.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \trowd\trql\trleft53\ltrrow\trrh450\trpaddft3\trpaddt0\trpaddfl3\trpaddl0\trpaddfb3\trpaddb0\trpaddfr3\trpaddr0\clbrdrt\brdrs\brdrw5\brdrcf18\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx770\clbrdrt\brdrs\brdrw5\brdrcf18\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx6434\clbrdrt\brdrs\brdrw5\brdrcf18\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx7992\clbrdrt\brdrs\brdrw5\brdrcf18\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clbrdrr\brdrs\brdrw5\brdrcf18\clpadfr3\clpadr55\clcbpat8\cellx9690\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +\cell\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Lorem ipsum}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Lorem ipsum}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Lorem ipsum}\cell\row\pard\trowd\trql\trleft53\ltrrow\trpaddft3\trpaddt0\trpaddfl3\trpaddl0\trpaddfb3\trpaddb0\trpaddfr3\trpaddr0\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx770\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx6434\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx7992\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clbrdrr\brdrs\brdrw5\brdrcf18\clpadfr3\clpadr55\clcbpat8\cellx9690\pard\plain 
\s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +1}\cell\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +In eleifend velit vitae libero sollicitudin euismod.}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Lorem}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +\cell\row\pard\trowd\trql\trleft53\ltrrow\trpaddft3\trpaddt0\trpaddfl3\trpaddl0\trpaddfb3\trpaddb0\trpaddfr3\trpaddr0\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx770\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx6434\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx7992\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clbrdrr\brdrs\brdrw5\brdrcf18\clpadfr3\clpadr55\clcbpat8\cellx9690\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +2}\cell\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Cras fringilla ipsum magna, in fringilla dui commodo a.}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Ipsum}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +\cell\row\pard\trowd\trql\trleft53\ltrrow\trpaddft3\trpaddt0\trpaddfl3\trpaddl0\trpaddfb3\trpaddb0\trpaddfr3\trpaddr0\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx770\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx6434\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx7992\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clbrdrr\brdrs\brdrw5\brdrcf18\clpadfr3\clpadr55\clcbpat8\cellx9690\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +3}\cell\pard\plain 
\s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\ab\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Fusce vitae vestibulum velit. }\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Lorem}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +\cell\row\pard\trowd\trql\trleft53\ltrrow\trpaddft3\trpaddt0\trpaddfl3\trpaddl0\trpaddfb3\trpaddb0\trpaddfr3\trpaddr0\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx770\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx6434\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clpadfr3\clpadr55\clcbpat8\cellx7992\clpadfl3\clpadl55\clbrdrl\brdrs\brdrw5\brdrcf18\clpadft3\clpadt51\clbrdrb\brdrs\brdrw5\brdrcf18\clpadfb3\clpadb55\clbrdrr\brdrs\brdrw5\brdrcf18\clpadfr3\clpadr55\clcbpat8\cellx9690\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +4}\cell\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Etiam vehicula luctus fermentum.}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql{\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Ipsum}\cell\pard\plain \s58\ql\nowidctlpar\hyphpar0\noline\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\intbl\ql\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +\cell\row\pard\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af11\rtlch \ltrch\loch\fs21\loch\f7\hich\af7 +Etiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. 
Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af12\rtlch \ltrch\loch\fs21\loch\f8\hich\af8 +Maecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.} +\par \shpwr2\shpwrk3\shpbypara\shpbyignore\shptop0\shpbxcolumn\shpbxignore\shpleft2819\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch +{\*\flymaincnt5\flyanchor0\flycntnt}{\shp{\*\shpinst\shpwr2\shpwrk3\shpbypara\shpbyignore\shptop0\shpbottom2660\shpbxcolumn\shpbxignore\shpleft2819\shpright6819{\sp{\sn shapeType}{\sv 75}}{\sp{\sn wzDescription}{\sv }}{\sp{\sn wzName}{\sv }}{\sp{\sn pib}{\sv {\pict\picscalex100\picscaley100\piccropl0\piccropr0\piccropt0\piccropb0\picw200\pich133\picwgoal4000\pichgoal2660\jpegblip +ffd8ffe000104a46494600010101004800480000ffe20c584943435f50524f46494c4500010100000c484c696e6f021000006d6e74725247422058595a2007ce +00020009000600310000616373704d5346540000000049454320735247420000000000000000000000000000f6d6000100000000d32d48502020000000000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000001163707274000001500000003364657363000001840000 +006c77747074000001f000000014626b707400000204000000147258595a00000218000000146758595a0000022c000000146258595a0000024000000014646d +6e640000025400000070646d6464000002c400000088767565640000034c0000008676696577000003d4000000246c756d69000003f8000000146d6561730000 +040c0000002474656368000004300000000c725452430000043c0000080c675452430000043c0000080c625452430000043c0000080c7465787400000000436f +70797269676874202863292031393938204865776c6574742d5061636b61726420436f6d70616e79000064657363000000000000001273524742204945433631 +3936362d322e31000000000000000000000012735247422049454336313936362d322e3100000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000058595a20000000000000f35100010000000116cc58595a20000000000000000000000000000000005859 +5a200000000000006fa2000038f50000039058595a2000000000000062990000b785000018da58595a2000000000000024a000000f840000b6cf646573630000 +00000000001649454320687474703a2f2f7777772e6965632e636800000000000000000000001649454320687474703a2f2f7777772e6965632e636800000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000000064657363000000000000002e4945432036313936362d +322e312044656661756c742052474220636f6c6f7572207370616365202d207352474200000000000000000000002e4945432036313936362d322e3120446566 +61756c742052474220636f6c6f7572207370616365202d20735247420000000000000000000000000000000000000000000064657363000000000000002c5265 +666572656e63652056696577696e6720436f6e646974696f6e20696e2049454336313936362d322e3100000000000000000000002c5265666572656e63652056 +696577696e6720436f6e646974696f6e20696e2049454336313936362d322e310000000000000000000000000000000000000000000000000000766965770000 
+00000013a4fe00145f2e0010cf140003edcc0004130b00035c9e0000000158595a2000000000004c09560050000000571fe76d65617300000000000000010000 +00000000000000000000000000000000028f0000000273696720000000004352542063757276000000000000040000000005000a000f00140019001e00230028 +002d00320037003b00400045004a004f00540059005e00630068006d00720077007c00810086008b00900095009a009f00a400a900ae00b200b700bc00c100c6 +00cb00d000d500db00e000e500eb00f000f600fb01010107010d01130119011f0125012b01320138013e0145014c0152015901600167016e0175017c0183018b +0192019a01a101a901b101b901c101c901d101d901e101e901f201fa0203020c0214021d0226022f02380241024b0254025d02670271027a0284028e029802a2 +02ac02b602c102cb02d502e002eb02f50300030b03160321032d03380343034f035a03660372037e038a039603a203ae03ba03c703d303e003ec03f904060413 +0420042d043b0448045504630471047e048c049a04a804b604c404d304e104f004fe050d051c052b053a05490558056705770586059605a605b505c505d505e5 +05f6060606160627063706480659066a067b068c069d06af06c006d106e306f507070719072b073d074f076107740786079907ac07bf07d207e507f8080b081f +08320846085a086e0882089608aa08be08d208e708fb09100925093a094f09640979098f09a409ba09cf09e509fb0a110a270a3d0a540a6a0a810a980aae0ac5 +0adc0af30b0b0b220b390b510b690b800b980bb00bc80be10bf90c120c2a0c430c5c0c750c8e0ca70cc00cd90cf30d0d0d260d400d5a0d740d8e0da90dc30dde +0df80e130e2e0e490e640e7f0e9b0eb60ed20eee0f090f250f410f5e0f7a0f960fb30fcf0fec1009102610431061107e109b10b910d710f511131131114f116d +118c11aa11c911e81207122612451264128412a312c312e31303132313431363138313a413c513e5140614271449146a148b14ad14ce14f01512153415561578 +159b15bd15e0160316261649166c168f16b216d616fa171d17411765178917ae17d217f7181b18401865188a18af18d518fa19201945196b199119b719dd1a04 +1a2a1a511a771a9e1ac51aec1b141b3b1b631b8a1bb21bda1c021c2a1c521c7b1ca31ccc1cf51d1e1d471d701d991dc31dec1e161e401e6a1e941ebe1ee91f13 +1f3e1f691f941fbf1fea20152041206c209820c420f0211c2148217521a121ce21fb22272255228222af22dd230a23382366239423c223f0241f244d247c24ab +24da250925382568259725c725f726272657268726b726e827182749277a27ab27dc280d283f287128a228d429062938296b299d29d02a022a352a682a9b2acf +2b022b362b692b9d2bd12c052c392c6e2ca22cd72d0c2d412d762dab2de12e162e4c2e822eb72eee2f242f5a2f912fc72ffe3035306c30a430db3112314a3182 +31ba31f2322a3263329b32d4330d3346337f33b833f1342b3465349e34d83513354d358735c235fd3637367236ae36e937243760379c37d738143850388c38c8 +39053942397f39bc39f93a363a743ab23aef3b2d3b6b3baa3be83c273c653ca43ce33d223d613da13de03e203e603ea03ee03f213f613fa23fe24023406440a6 +40e74129416a41ac41ee4230427242b542f7433a437d43c044034447448a44ce45124555459a45de4622466746ab46f04735477b47c04805484b489148d7491d +496349a949f04a374a7d4ac44b0c4b534b9a4be24c2a4c724cba4d024d4a4d934ddc4e254e6e4eb74f004f494f934fdd5027507150bb51065150519b51e65231 +527c52c75313535f53aa53f65442548f54db5528557555c2560f565c56a956f75744579257e0582f587d58cb591a596959b85a075a565aa65af55b455b955be5 +5c355c865cd65d275d785dc95e1a5e6c5ebd5f0f5f615fb36005605760aa60fc614f61a261f56249629c62f06343639763eb6440649464e9653d659265e7663d +669266e8673d679367e9683f689668ec6943699a69f16a486a9f6af76b4f6ba76bff6c576caf6d086d606db96e126e6b6ec46f1e6f786fd1702b708670e0713a +719571f0724b72a67301735d73b87414747074cc7528758575e1763e769b76f8775677b37811786e78cc792a798979e77a467aa57b047b637bc27c217c817ce1 +7d417da17e017e627ec27f237f847fe5804780a8810a816b81cd8230829282f4835783ba841d848084e3854785ab860e867286d7873b879f8804886988ce8933 +899989fe8a648aca8b308b968bfc8c638cca8d318d988dff8e668ece8f368f9e9006906e90d6913f91a89211927a92e3934d93b69420948a94f4955f95c99634 
+969f970a977597e0984c98b89924999099fc9a689ad59b429baf9c1c9c899cf79d649dd29e409eae9f1d9f8b9ffaa069a0d8a147a1b6a226a296a306a376a3e6 +a456a4c7a538a5a9a61aa68ba6fda76ea7e0a852a8c4a937a9a9aa1caa8fab02ab75abe9ac5cacd0ad44adb8ae2daea1af16af8bb000b075b0eab160b1d6b24b +b2c2b338b3aeb425b49cb513b58ab601b679b6f0b768b7e0b859b8d1b94ab9c2ba3bbab5bb2ebba7bc21bc9bbd15bd8fbe0abe84beffbf7abff5c070c0ecc167 +c1e3c25fc2dbc358c3d4c451c4cec54bc5c8c646c6c3c741c7bfc83dc8bcc93ac9b9ca38cab7cb36cbb6cc35ccb5cd35cdb5ce36ceb6cf37cfb8d039d0bad13c +d1bed23fd2c1d344d3c6d449d4cbd54ed5d1d655d6d8d75cd7e0d864d8e8d96cd9f1da76dafbdb80dc05dc8add10dd96de1cdea2df29dfafe036e0bde144e1cc +e253e2dbe363e3ebe473e4fce584e60de696e71fe7a9e832e8bce946e9d0ea5beae5eb70ebfbec86ed11ed9cee28eeb4ef40efccf058f0e5f172f1fff28cf319 +f3a7f434f4c2f550f5def66df6fbf78af819f8a8f938f9c7fa57fae7fb77fc07fc98fd29fdbafe4bfedcff6dffffffdb00430005040404040305040404060505 +06080d0808070708100b0c090d131014131210121214171d1914161c1612121a231a1c1e1f212121141924272420261d202120ffdb0043010506060807080f08 +080f201512152020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020ffc2001108008500 +c803011100021101031101ffc4001c0000010501010100000000000000000000020001030405060708ffc4001a01000301010101000000000000000000000102 +0300040506ffda000c03010002100310000001cef90fb09a72952d6a75cbea4d4e63c8fb5e46b47b7ac13eefbbc600d1ab0076056ce0100416acafcf72f764c5 +b2c88cb7775e3dda7264f5754ed3360db03601aa1102d6c81bcfca398411c56ce32d9d4101c847bb834e8bbcea88916beadd3e3db6907534b4989384bd3c6b57 +b1dcfd09846af1ab3660044e582d96ce0b8c977982f6f3d37b5257036b9fabd4fd0f0a4558fa5e5acf3a5d3e2d4eecbbc3604bdbb9a4db0e6118183157c160f8 +ac5d5a347f15a74410725d6263afe6eaf48eef09d7074996aa0a7cea3eb705d52c4e9e4f68e0af54a83b33289472ae558e432c5c3514af8776343ced624d7f9f +a3d1a27b2b798614fb94983918fcddbe1fd74c0e9e4d74deede75e4647644513222110b640a0d842bc5deb940e0149f97b7dab8d374f1985b3e945d821a0957c +a376f95fa3c359e5ed1e7f5773006e8e5132b156216c81607917b60d2a294e7185b8dbd53cdbc926255e8bdaf1d6c2a6347c89f57cf3ea72e3db9b690fbf799d +368cdd95ca265121608e056f34e8b516a62837675ea796fa5e67a4b9fa6ca6f41fa7f91e7bccf5f3d9f03ba16d27c2d5312d0b197b291d645264e9d652ba065a +6af66b2a894f30e8e9c12dae94b894e8b92d67cef4adcab6267b3f67e7208d2bad7396f84dabbcb61277424f49d5a2d869a74074899200d18311192691ea47a9 +5453a1e6b75ca97edcd61e3ccf93ee36c8e00d09d1b4e85a117571eaa248c8656ab2e55467d044da33a326916bd36d293cf3aef21e9001471538fc3e857d5061 +015379d829436c2eff003e769c84390442da26d20d608ead92cbeb20a9b20c4ac919908ccf21e7fbce529d6557a79bb6bf0d20451dcace54ca995321c81d9308 +b1a28d9aadaef3d1759f31024aee19d779d799ee994ad49e7d274a9372b6996f64be8b7535802719f013a225866d9d73111632309e82d93054684df84e1f55ca +46c80c88a8b2d8cb315972332915b886da9914b6cfb2c61c63c50cc0be311780b366c74acd95ca8b21e05945d642b3653c0b006d19138d710ce8c810c6224766 +0483163097af9b9971a73d2e08a8b2cb84a01b239078160632d85b44c076b2bae4d9958098f173833572f51cff00ffc400271000020202020005050101000000 +0000000102000304110512101314152120222330310632ffda0008010100010502af2d5695cb5d26526ce5d7af51597af22a09996abbf1aea178c707337fa6eb +7a0b73ea42f9f6bc2db338bab55388bc6e341c7e34f6fc79edb8f3dab167b56346e130ccf65c5598bc7d78d67e9e62deb429d9dcdcabee7c44eb5b41e0cea82f +e5f168193fe8ec2389cff5b87fb39bbbf3d66769b982bdb2291aa8c1e19950bb1eea4d76389c2e6fa4cf0db1fa9ce9390b7cce401d0ed17e4f169b71f00c1e0c +36398c6eb1c7c3ec1e173c65e16e6ff4e5375aafa6c1779b15a24e213f10fa732916d194a68b9da70f9de93391c3afe9cf7d56ca1a5b88ad2ca1e995640338f4 
+e947d2dfcff418718cdce073fd462fe9cfb3ee8cc00bed16b5184ad70b854bea84f553b09dd67710b4cda45d466d268c99c6659c4cda6e4b2bee27713b4dcd89 +b13719beccdb8b65f9c105b9165cd8f474984377dcdf954f835408b0148d7f90afcbdd64a33b2ae4e4b1fb5c38d768386b8cc3c6cfc756f702455c896ae96ea4 +6a7c18287f36c53d2c3d71f27cc46d59735552a0dce3ff00e06364315c6b84f4f6c366e7c464ada3a56b2fed282cb120eb0d9d51722d8320c39680faca8c3954 +439b54f5f5cb6faecacd8b3ad6d0e35261c1130686088ee816c46f0dcdc0e561b0c2e67dd2f6b02d64ec749f6cd88f5abcb53cbf1dee323b0eb9ca557334b4e4 +7508c138dfe7c19d04d389b84fcf710dc04acb5a3a58a2d606b1c960ac1cb7193de38b9ef5c64f7bc09ef58f0729483ef16caf90e46d38eb94cb563574c2cb37 +3e3c35353460b277967f31f1bcf643522e552f90130eae9e8717630f1c4f4f8f3c8a27934cf2699e5d027998c91b35046cdc833d466f9899d5c5bd1e6c78ee6e +6ccdec16962069d6043111a00c225b6082f58b6033737a9e609e66e7e49f93c3e2796860362c195608996a63d8e5aab3ecdf81f99a86a0604eb06a09a9d60dac +5bf5058ad3e3c373b426769b9b3e0c81a75759e75cb036fe9fecd6a0fa089f316c6116f9d819a9a3e3b80cdc30995b180fd0209a83e8d4226a76222d8606dc33 +535e061319a7ffc40029110002020103030402020300000000000000010211031012131420210430314122324051425271ffda0008010301013f01e75b459059 +50b244e58d8e71334d367a56a8f54ed32bd9c58b77c8b11b628b123d5cabc1063f5733abc875990eb6675923ac97f473a7f4473a5f44b2a6bdac3e116596633d +4cae4458f48c1cbe05e95ff90a38e3f08cb15fb445dd6597d88b2cb3e206577310f4f4f3db224a8911ff0051c6bb6cb2cb2cb225963647cb32ba88c43d22c8cb +7c2c648fdd595d97a5e965912d9bcdc6247a97f88fb704e9d13f0c9331ce9d0c631f7c74a2a8c534677e6bba24ff002858c6467ba23efbd13132cbbf04511c5c +96ce94e94dacd8cd8c51313fa322a6331ca98fbdca84c4597645511661f102f48e46998da912499e2bc11b26ace338f56cdcc7214af4cdfa98dfe3aa1310b241 +7d9cd0fece7c7fd8b1d1ff0004e421c91bc721b1f91c46b47a58dd884c421116bec9414be096392d28a1c14858d0e28a4645e0b1fb5626290d9e4de5c5fc95ae +e1e48af937c5fd925e28716b4a36336338d9c6718e2977d965a290c64e7b476fe48cf69d448ea247348e591c9239246f917236c99c7238cd838b2bb28a1bd18f +4a286b4dc5eb456946d28a36238cd86c66dd1fbf5a5695a228da5e8fddb2fd9bfe4fffc400271100020201030305010101010000000000000102110312132110 +3031042022415132714061ffda0008010201013f019fa66e64bd3325e9e42f4f33624912c523d2637147ac5f230aa5da9cabc0db63b12ae9815cac92349b66d2 +364d9364a92fb256fc9cf6a4ad92545144f846055024210e5a4d6df81636fcb3f87a5f7631268d2513e59055148911e993f48cac4ccb1b565f6a86254868d232 +ae5d18844910e1d084648e8976e86b8a1e2fc1c4998d7cafa3285d26ab921cab1232c3547b715cf4b387e4c985a22bd9432460953a174cd0d2fb504328aa272e +192969e0dc1e435a3711ad0e478766376ba648ea4557bd2b1aa20b828aa2465f06456c6868960543e04acc78235726648417831c8dc46ea2528485a0f81a50a0 +38d74c5fd13fe855d19465f234c7063848795b3fd383511ff492ff00d28a170596722b39341b6cd2fecd2726a66b3273e08b6852470596597d71792bd898bd96 +7c075fa3a18d53e95eca349a19a24478628d9b4cdb34234c4f81aa3fa5c054fc15d1f5a1a348e2fa210a3621ad46cc4d98fe9b31fd3661fa6d43f4d981b58cdb +c68bc68de88f39ba9fd8a66b2d76132cd46a2c691a0aaeb451c0ebaea685919b86b1648fd8e4bebda99e7ad96793495dab351657bafdb7d290e3daa1aff81c7b +147fffc40037100001020304080305090100000000000001000203112110223132041213203041517105618123339192a134404450627282b1d1e1ffda000801 +0100063f02ca665652b036628de46ab1512478545576b1574ea8b5bf3599164591645902c817bb0aeb648b982a78520713bbf4dc9b96799f24760c97741e7373 +e2b61f4aee3021e75dc737a84e6bacd471f671388544f2a6e39dd0296eed00b263109b337db43c329d10733357a96877533de2d4e866c13371f42811c222d98c +100534746efed982d0c71bcda70836caad935426cb9a975aee636b9a5399635d3ba6850703c09a701c954ad56299c56b72689a23a53708aa2f72f6508a94766a 
+29b44d652a8d280ab82a43430015ec6dd6d6a5ae76ccd55682d73bcd4f665642b2158aaaab66a90fe8a90dc7b291d0dcab064b0540aac0b2a92aacca86c9b5f5 +1c955640561257627c56a8a96a93db3dcc563662b321227d148cfe3b955564c750ba85fed0aeaa8d3eaae43791d94e2682f1e6da2d6911e4f0b5ccbd13a2c8c8 +d2da1b3c9557557184a996482241527c6d4eed2bed7f0615efe21ed08afc43bb4357745d31dfc153c374b3e8a70fc0e2cfd02bbe06ff009d4a1781fab9ca7a4b +2143fd2caabbae7f7bcbbfb5592af00a9c58a18cef5283585a02019a4ecc04369126eea1722b92e4b95b8aff008aec37395c820775388dd76f4064af35d0fb85 +4703c19852598ac4aa2a99aad15372eae4ab6cc2bb10faabc26ab4536b95ec787457adc38375df70afe49fffc400281000030001030304020301010000000000 +00011121314161105171208191a1b1c1d1e1f030f1ffda0008010100013f21350b67f11946af629e589cb0d081f4acc914840cd036363f53549a37b9ed0742d9 +e0c759bac6d3c1a175adf8e8a2b47c1d90e30dbb3e0ffc01cc35bf88fd043766e9b0d8dfaf4e0e06f8285d2b3f22302d122bd28bb22e479a8adb2119c3656eca +70bd98d8fa31b2fa9b8a97ab4518af4e93ca159ccfd06f18572c323472e5c2f86216d32946c6fd7745a275309a2a63078105ee6226d8e84ea1f6b086e6658ca9 +fa0084bd6fa6af819836dcc29c2d09794e8f83225facb4fb18a242e8f52b830431e0cc3178301a55a8bff1e56c15aa16bc19a7034b0cb2acaaf96265213a2d0c +4b69af49b2756a8eee784c5e9a5296a3928c2da0b29b753b2468b979c9f8fd0bbc4f725af495d188b04d56a1ca58b81b35f041265a68e53948ee41cc5b71a255 +884c1806301d12d87f717baee3235941b99fd0584f032aa3a450d6178d847715eb0813685b98bf73f8cc6a19914d9b197535dd94eee4516a84eb44b0ab7b12ec +79379c8af2dab52376f68d207112435bd1de64a455e4d60909b20f818a310d01f608f7c2217bc93486bf9670508741b16bf186f59291865d19195028b2b1e9ba +c861b0740b3ca1bf94bb9a7b78b1cdae105ac41b6d77625c15ba34a8636868a3a411455e4677e84dbe45802a1acd72317f627da8e1468ecf71e97eaef23cab16 +a269e4b5ca9cdb007a0978359f935135b27ec5e169921e40370a497217ba29065ca94d6b50e74dec7414cd062fd435ea9e4d29a0ccd42d7036af2356ccbe25c0 +e32cb6fd0423aff8f8124c27a7bc099f426bf7326a2a5b90abadbeed3f826c2e025f26507ee37c96db35eff900ac4d5b1c1d092d887aa38b185387ba1a771ee6 +a9fd1859181558e893171b3a256b125772824cad36ce61a1af8a33dd5ec7f9412fb7d1e1f92aad7e4e27b5185f14a1f7d21e4c1e95271feb22ef8f627956cc57 +b929ca409cb0f28bd4c8f587d0471911453f229334e480d8c6c1c9c80d4744e2860778d0f413e469968124da57b8db6a36211f7583631db23169f03b81c8c98a +3268522b794f27784bb191289b1da44190c5f5cd121a73237664482920cfa0d2e8347a1c827dce00dc23d6e4904c793429204fa3a15137b928d223c1cb8878c3 +a375a31a87ca141f507eb2b746ba309485d48592040c2d0be9d88431a8c21878e48fffda000c030100020003000000106b0631a8f6e44e0c76cfc6a197c34b9c +d5a5e0fb54038412dabf4d200da8ebcf66ea2642070e2ecb9e6ef3daaf46c64184b704900e32fd4ed133cd5d6d80b812412cbe809ae198e77100949233ff002e +20ddb23398c9b0917cb5da4fe182bd6b94f1d5ab100351287c7902e5b68139144dd8dc3e3dfbc7e8199d0098cd819fcd9cdd48ab2249280e782fc18ff9527a96 +29a4292210f465cc52715aba0ea39c6ebfffc400271101010100020202010304030000000000010011213110415161207191f03081b1d1c1e1f1ffda00080103 +01013f104389cc21c966bb8b6ddacdbb30e6e1760317682083f226bd220c3823aa4ead38b03fa0ff009f083a09fabc1fa37d65a708bdc374189867bb39820fcb +21cfe2c672e5900fd5f13966f2139790bd8fdce02e184441e37f003e0359e01702e4ce9c8e66d0fae3c5cd93bee548c266afa7fcda31820ba967f002116b1cae +5c401e28190d75f05360df682ce5bdd2504f71e0b2ccdf0d8847cc03a472e6757b9b1c1fa7efff0050b2596db9974cd62e4b9b74dc65294b2db6c30ca1b2cbe4 +4c0b7112c96d89e3708f64e79653ee52cb2cb6db107cc11db166992afecff3f7956a1babebb1ec9336f6adf2571cf4ca665996d806b6c693e2316f81e27b2e27 +df3fbc4610c28c4625c382f62678cfce4c08632d91c7814eed516ba5b08788d679e0e667cc46407d2c3d6ec246905c6cb4d5847521f5690c0507cc07b9c81663 
+a10cb0624b3af0294a224575e37773bee79e463d486a41f57c39f8eed92b64cb2db6cbe46123ede5053211d42e98e82dac664b938bb865bb0e522a771236dfe3 +fead3ff1ff0057f2e7c1ab5768ce7af1b16c30afb453e47ac1acd36777399d38253d4bfa95f536c296deaf84c27647cdb21c42ecf036186356bc0bb2b94e4891 +ebc0697dac3259b6885e00f86acbddadf56d90109ee594b2ccccc925996dbe0b3c02d41f3620b84060782ca6d999667c659e062134f231110c2b65999966dfe8 +0c30dbe06186dbffc4002511010101000203000104020300000000000100111031214151a120618191b1d1c1e1f1ffda0008010201013f10eede090f86cde188 +f522993dcc8e692bb9c16df12cf19659270fe33c87715e178409e6daf81ff5c06bb6036d03d36becfb6c7460fbac5870f1965924927058274c709b8fdff88c61 +1f6ba664e8d58599e72cb2c9249b4f3fa20e05fd99fdff00d42110cf08009c5e3fd91b2db780b2cb2c9261f1c26cce08e118897cf01c3727e45e2d9cf32a1e9b +620820b2cb2c9274842e92463c30bf86e144125a31eac10e2ebbb20fb11041671964969772ed7a45b89642fde18390e9790f00bce9d30701071965925dec2d3d +465aca26c40fe5261b2e15a0dc9f00b098237252c6218638c25584cb1b133bb116c3a7db6794dff3e61acb6789312be7b00796cbdd9f6f1c09e586e9d41eac5b +3d65e1e5115347867833a3f69cee5f0caf575accfd96badbcbb82d59d3b0e6c1c3f90fa12f9914a60dd1bc935ead1eecedfb26b5f6bc026df0c2b82a19698585 +90e5919a5911213de98888076cabb603a8b599fd5e4e136193931d3abc7026143f46c91a93c197af01f6ff008ff71ee7f25876fe4b07b3fbbc38fe565ece5384 +895f778982cfc5ea58c2f58bcac1e0bacf51f68fb22d8c3ff75fbbf982f7f9b56afe63af927d30fa9ec43f4dfbad271e999725d8b320d9cee17636a7ee0bdcff +0036bd4bedc6c6a71df05e9c9d46f623ea2e4c16c84478eb852d8ec6c42a9da7e251df1bfa326ce0642f76271645bc1e235c78b3e43f6268cb4a489c6fe878ce +05267031c8f1bceb0c360c1271b6f19044ffc400261001000202010305000203000000000000010011213141516171108191a1b1c1e1d1f0f1ffda0008010100 +013f106a73081d65206d754f9e528f6a31aa8a2417d25049ef0c54150cf760d9b8d3da69f12ff497a1083061b1dc4ea2a1babdc7d60e1e5f78b969955bb8c17c +acb2b91f134fd21e710016fb200a3e385383f69b3f8517b53f647b0a1e84b0b8b059901a9b0b2e817ea0bea420f58a4c3823486dfc8f7b6b65611c16e282ad41 +19940776a3fb25a306218e8a7955411a7ba1a06ca6d7f04d18d4e0852ac688bd07d443d2e7a08075ac0f2d1f8ca2cd46d45cb20eb0ea2cf8c67f89627917bebe +aa0b187042585d2186a5dc2c05e652c28971c152e3858611136465f4192199999832e0c2b9ac4c7a49ceabfbb950b8fb6287494617479963f2574800f68ac954 +09783a605f03907112399429a85b12565a615e6ce630c36f41a95a8b2e5c194a3a6fd414000f55b9bb0b1981613acc65db29de86f67f83efd16c4a48220950f5 +a554099b5972455371298c1781e18004036730bc1bf55972e5cb9443a7de56098841cc5d79911e91f4e6acc5aa03f90fe7a54a8461182d89b65ce86c88ac30c8 +962710ac42a5729c3e8692e2e22cbf51a0b0ccd9542cadc0c4036cca42a44a7c4bcd6c1b7eae5460cf6b81f16f79c28c18870d546bd23acdccb13307c59ab942 +42df18cb2f1820de29e6186122303dc09c65bc23d6947183282b2f451084e2a5f8a9f76e2b35d6ae38b4d5e04afce655b61ae513f0fd967e5575d453f631516f +a06e2ba8e405e298e834be60758d11b85310e3710b2a106d48a8b5ce5091017820850bd584af37a038250c9e59596f726ea68f8ce66dd656058a7497b36d406e +1a45a95728226bd721cc1c415d8a5825059302b022e3761d0387bc52e2594ab58b51531a9530f78d4a08232c15fb91101744b218b788540f7a181e68f50f961b +bf662621fad162bb827544c5ae521f1a1a5ea092d4a4ea471431d259e9f80cc415172d6a16e501a5973f3a992b7804b4546570a2e2cfd90ce1e4e2c4bf8b9a97 +8aadf785dd6bb12985f45e21535f2c3c08f0c7758f94e2bf12ed50ee131c55d897e308bb1fd4b6fcf65528176bde07af9a5e501ed0ce068e122ceed9f83679d4 +6b0484094ecde6215300ad41d7fac4c933b6781b8a42f217e046258aacdbe496e20bb1b672d388e62054001cda95de51511558e3748665ca8cf4581969ced4f6 +96590f78a6d5792e3fc119468afa89b9d8b648bb674da00d95e0b8ab869188f4bd40fdf68003e61d3460a2b59a8ad18ca6fd930ccbdc7e88a63c95fccb74bc0b 
+f6156475c2cd48991b175b90edbadb71e0081581c15742794109341b05e5d07b0cbe97b5ff00b595ed53722e09798bc63f0090b180f6250d08e82cb0643ce670 +d18cb7c29cc23b12b657de39758e344c8103a071e61775a1ff00d599a4401dd36e4ed070095f113350f1275bf92a9156bfe728b1ef7fc41f47b7fa4ca01ef7fe +100cfe12ff006da75f88970b4ea2fc852a4e88fb86a7466d622be521be5ab98523ac7f0b21648ed4e52a591cd2d805f9964ca8e0b83e503d79966f50dc0bd26f +a83d1a96885ec6f8978b8b16912248d6b1122abc434dfd095a498e8c54ee864a3bd457b116f09ed2fb21c54ab168350ff12b27cebea53d81051043975300bb51 +999345092c84f28330c75876435eb8260bc01e605c2e183044b8a60458c572e63c13da353dc403097c4a76107883c1021a22e6611790992753c896e44cf5aba9 +1fa216094904236f31846e62ba865704e489c910b51299c4af5348458c449885f246505b4251fc881b090738778210c41070d44bc45e6386483d677263151c16 +41ed0eb10c36466a0ed1259166a013094d88a6c83e210c4b2c6e12e67040992585826462a3ac5a84b89527ffd9}}}}} + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s2\ql\nowidctlpar\hyphpar0\sb200\sa120\keepn\ltrpar\cf17\b\dbch\af9\langfe2052\dbch\af13\afs32\alang1081\ab\loch\f4\fs32\lang1033{\listtext\pard\plain }\ilvl1\ls1 \li936\ri0\lin936\rin0\fi-576\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{{\*\bkmkstart __DdeLink__109_736781840}\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af12\rtlch \ltrch\loch\fs21\loch\f8\hich\af8{\*\bkmkend __DdeLink__109_736781840} +Maecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.} +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch + +\par \pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225{\scaps0\caps0\cf1\expnd0\expndtw0\i0\b0\dbch\af12\rtlch \ltrch\loch\fs21\loch\f8\hich\af8 +In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. 
Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.}{\rtlch \ltrch\loch +In eleifend velit vitae libero sollicitudin euismod. } +\par \shpwr2\shpwrk3\shpbypara\shpbyignore\shptop0\shpbxcolumn\shpbxignore\shpleft2819\pard\plain \s51\sl288\slmult1\ql\nowidctlpar\hyphpar0\sb0\sa140\ltrpar\cf17\dbch\af9\langfe2052\dbch\af13\afs24\alang1081\loch\f3\fs24\lang1033\qj\widctlpar\sb0\sa225\rtlch \ltrch\loch +{\*\flymaincnt5\flyanchor0\flycntnt}{\shp{\*\shpinst\shpwr2\shpwrk3\shpbypara\shpbyignore\shptop0\shpbottom2660\shpbxcolumn\shpbxignore\shpleft2819\shpright6819{\sp{\sn shapeType}{\sv 75}}{\sp{\sn wzDescription}{\sv }}{\sp{\sn wzName}{\sv }}{\sp{\sn pib}{\sv {\pict\picscalex100\picscaley100\piccropl0\piccropr0\piccropt0\piccropb0\picw200\pich133\picwgoal4000\pichgoal2660\jpegblip +ffd8ffe000104a46494600010101004800480000ffe20c584943435f50524f46494c4500010100000c484c696e6f021000006d6e74725247422058595a2007ce +00020009000600310000616373704d5346540000000049454320735247420000000000000000000000000000f6d6000100000000d32d48502020000000000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000001163707274000001500000003364657363000001840000 +006c77747074000001f000000014626b707400000204000000147258595a00000218000000146758595a0000022c000000146258595a0000024000000014646d +6e640000025400000070646d6464000002c400000088767565640000034c0000008676696577000003d4000000246c756d69000003f8000000146d6561730000 +040c0000002474656368000004300000000c725452430000043c0000080c675452430000043c0000080c625452430000043c0000080c7465787400000000436f +70797269676874202863292031393938204865776c6574742d5061636b61726420436f6d70616e79000064657363000000000000001273524742204945433631 +3936362d322e31000000000000000000000012735247422049454336313936362d322e3100000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000058595a20000000000000f35100010000000116cc58595a20000000000000000000000000000000005859 +5a200000000000006fa2000038f50000039058595a2000000000000062990000b785000018da58595a2000000000000024a000000f840000b6cf646573630000 +00000000001649454320687474703a2f2f7777772e6965632e636800000000000000000000001649454320687474703a2f2f7777772e6965632e636800000000 +00000000000000000000000000000000000000000000000000000000000000000000000000000000000064657363000000000000002e4945432036313936362d +322e312044656661756c742052474220636f6c6f7572207370616365202d207352474200000000000000000000002e4945432036313936362d322e3120446566 +61756c742052474220636f6c6f7572207370616365202d20735247420000000000000000000000000000000000000000000064657363000000000000002c5265 +666572656e63652056696577696e6720436f6e646974696f6e20696e2049454336313936362d322e3100000000000000000000002c5265666572656e63652056 +696577696e6720436f6e646974696f6e20696e2049454336313936362d322e310000000000000000000000000000000000000000000000000000766965770000 +00000013a4fe00145f2e0010cf140003edcc0004130b00035c9e0000000158595a2000000000004c09560050000000571fe76d65617300000000000000010000 +00000000000000000000000000000000028f0000000273696720000000004352542063757276000000000000040000000005000a000f00140019001e00230028 +002d00320037003b00400045004a004f00540059005e00630068006d00720077007c00810086008b00900095009a009f00a400a900ae00b200b700bc00c100c6 +00cb00d000d500db00e000e500eb00f000f600fb01010107010d01130119011f0125012b01320138013e0145014c0152015901600167016e0175017c0183018b 
[... remainder of examples/sample.rtf omitted: RTF control words and hex-encoded embedded JPEG picture data (the same image and ICC profile appear twice) surrounding lorem-ipsum headings and paragraphs ...]
+\par }
\ No newline at end of file
diff --git a/modules/files/src/test/resources/examples/sample.xls b/modules/files/src/test/resources/examples/sample.xls
new file mode 100644
index 00000000..1464ee15
Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.xls differ
diff --git a/modules/files/src/test/resources/examples/sample.xlsx b/modules/files/src/test/resources/examples/sample.xlsx
new file mode 100644
index 00000000..ffe12708
Binary files /dev/null and b/modules/files/src/test/resources/examples/sample.xlsx differ
diff --git a/modules/files/src/test/resources/letter-de.html b/modules/files/src/test/resources/letter-de.html
new file mode 100755
index 00000000..48ad7be0
--- /dev/null
+++ b/modules/files/src/test/resources/letter-de.html
@@ -0,0 +1,30 @@
+
+
+
+
+
+
+

+            
+Max Mustermann
+Lilienweg 21
+12345 Nebendorf
+E-Mail: max.muster@gmail.com
+            
+        
+

Max Mustermann, Lilienweg 21, 12345 Nebendorf

+

EasyCare AG
Abteilung Buchhaltung
Ackerweg 12
12346 Ulmen

+

Nebendorf, 3. September 2019

+

Sehr geehrte Damen und Herren

+

hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum nächstmöglichen Termin.

+

Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbestätigung zu.

+

Vielen Dank im Vorraus!

+

Mit freundlichen Grüßen

+

Max Mustermann

+ + diff --git a/modules/files/src/test/resources/letter-de.md b/modules/files/src/test/resources/letter-de.md new file mode 100644 index 00000000..2941a2db --- /dev/null +++ b/modules/files/src/test/resources/letter-de.md @@ -0,0 +1,29 @@ + Max Mustermann + Lilienweg 21 + 12345 Nebendorf + E-Mail: max.muster@gmail.com + +Max Mustermann, Lilienweg 21, 12345 Nebendorf + + +EasyCare AG
+Abteilung Buchhaltung
+Ackerweg 12
+12346 Ulmen
+ + +Nebendorf, 3. September 2019 +## Sehr geehrte Damen und Herren, + +hiermit kündige ich meine Mitgliedschaft in der Kranken- und +Pflegeversicherung zum *nächstmöglichen* Termin. + +Bitte senden Sie mir innerhalb der gesetzlichen Frist von **14 Tagen** +eine Kündigungsbestätigung zu. + + +Vielen Dank im Vorraus! + +Mit freundlichen Grüßen + +Max Mustermann diff --git a/modules/text/src/test/resources/letter-de-source.pdf b/modules/files/src/test/resources/letter-de.pdf similarity index 100% rename from modules/text/src/test/resources/letter-de-source.pdf rename to modules/files/src/test/resources/letter-de.pdf diff --git a/modules/files/src/test/resources/letter-de.txt b/modules/files/src/test/resources/letter-de.txt new file mode 100644 index 00000000..a559c152 --- /dev/null +++ b/modules/files/src/test/resources/letter-de.txt @@ -0,0 +1,30 @@ +Max Mustermann + +Lilienweg 21 + +12345 Nebendorf + +E-Mail: max.muster@gmail.com + +Max Mustermann, Lilienweg 21, 12345 Nebendorf + +EasyCare AG +Abteilung Buchhaltung +Ackerweg 12 + +12346 Ulmen + +Nebendorf, 3. September 2019 +Sehr geehrte Damen und Herren, + +hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum +nächstmöglichen Termin. + +Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbe- +stätigung zu. + +Vielen Dank im Vorraus! + +Mit freundlichen Grüßen + +Max Mustermann diff --git a/modules/text/src/test/resources/letter-en-source.pdf b/modules/files/src/test/resources/letter-en.pdf similarity index 100% rename from modules/text/src/test/resources/letter-en-source.pdf rename to modules/files/src/test/resources/letter-en.pdf diff --git a/modules/files/src/test/resources/letter-en.txt b/modules/files/src/test/resources/letter-en.txt new file mode 100644 index 00000000..b7051bc4 --- /dev/null +++ b/modules/files/src/test/resources/letter-en.txt @@ -0,0 +1,38 @@ +Derek Jeter + +123 Elm Ave. + +Treesville, ON M1N 2P3 +November 7, 2016 + +Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016 + +Mr. M. Leaf + +Chief of Syrup Production +Old Sticky Pancake Company +456 Maple Lane + +Forest, ON 7W8 9Y0 + +Hemptown, September 3, 2019 +Dear Mr. Leaf, + +Let me begin by thanking you for your past contributions to our Little League baseball +team. Your sponsorship aided in the purchase of ten full uniforms and several pieces of +baseball equipment for last year’s season. + +Next month, our company is planning an employee appreciation pancake breakfast hon- +oring retired employees for their past years of service and present employees for their +loyalty and dedication in spite of the current difficult economic conditions. + +We would like to place an order with your company for 25 pounds of pancake mix and +five gallons of maple syrup. We hope you will be able to provide these products in the +bulk quantities we require. + +As you are a committed corporate sponsor and long-time associate, we hope that you +will be able to join us for breakfast on December 12, 2016. 
+ +Respectfully yours, + +Derek Jeter diff --git a/modules/text/src/test/resources/logback.xml b/modules/files/src/test/resources/logback-test.xml similarity index 71% rename from modules/text/src/test/resources/logback.xml rename to modules/files/src/test/resources/logback-test.xml index 5b0b6a44..fdc4bdf7 100644 --- a/modules/text/src/test/resources/logback.xml +++ b/modules/files/src/test/resources/logback-test.xml @@ -3,12 +3,12 @@ true - [%thread] %highlight(%-5level) %cyan(%logger{15}) - %msg %n + %highlight(%-5level) %cyan(%logger{15}) - %msg %n - + diff --git a/modules/files/src/test/resources/scanner/jfif.jpg b/modules/files/src/test/resources/scanner/jfif.jpg new file mode 100755 index 00000000..215664be Binary files /dev/null and b/modules/files/src/test/resources/scanner/jfif.jpg differ diff --git a/modules/files/src/test/resources/scanner/pdf13.pdf b/modules/files/src/test/resources/scanner/pdf13.pdf new file mode 100755 index 00000000..9411060d Binary files /dev/null and b/modules/files/src/test/resources/scanner/pdf13.pdf differ diff --git a/modules/files/src/test/resources/scanner/pdfa14.pdf b/modules/files/src/test/resources/scanner/pdfa14.pdf new file mode 100755 index 00000000..ec3712c3 Binary files /dev/null and b/modules/files/src/test/resources/scanner/pdfa14.pdf differ diff --git a/modules/files/src/test/scala/docspell/files/ExampleFilesSupport.scala b/modules/files/src/test/scala/docspell/files/ExampleFilesSupport.scala new file mode 100644 index 00000000..e08962a9 --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/ExampleFilesSupport.scala @@ -0,0 +1,14 @@ +package docspell.files + +import docspell.common._ + +trait ExampleFilesSupport { + + def createUrl(resource: String): LenientUri = + Option(getClass.getResource("/" + resource)) match { + case Some(u) => LenientUri.fromJava(u) + case None => sys.error(s"Resource '$resource' not found") + } + + +} diff --git a/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala b/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala new file mode 100644 index 00000000..ac3bce6b --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/ImageSizeTest.scala @@ -0,0 +1,46 @@ +package docspell.files + +import cats.implicits._ +import cats.effect.{Blocker, IO} +import minitest.SimpleTestSuite + +import scala.concurrent.ExecutionContext +import scala.util.Using + +object ImageSizeTest extends SimpleTestSuite { + val blocker = Blocker.liftExecutionContext(ExecutionContext.global) + implicit val CS = IO.contextShift(ExecutionContext.global) + + //tiff files are not supported on the jdk by default + //requires an external library + val files = List( + ExampleFiles.camera_letter_en_jpg -> Dimension(1695, 2378), + ExampleFiles.camera_letter_en_png -> Dimension(1695, 2378), +// ExampleFiles.camera_letter_en_tiff -> Dimension(1695, 2378), + ExampleFiles.scanner_jfif_jpg -> Dimension(2480, 3514), + ExampleFiles.bombs_20K_gray_jpeg -> Dimension(20000, 20000), + ExampleFiles.bombs_20K_gray_png -> Dimension(20000, 20000), + ExampleFiles.bombs_20K_rgb_jpeg -> Dimension(20000, 20000), + ExampleFiles.bombs_20K_rgb_png -> Dimension(20000, 20000) + ) + + test("get sizes from input-stream") { + files.foreach { + case (uri, expect) => + val url = uri.toJavaUrl.fold(sys.error, identity) + Using.resource(url.openStream()) { in => + val dim = ImageSize.get(in) + assertEquals(dim, expect.some) + } + } + } + + test("get sizes from stream") { + files.foreach { + case (uri, expect) => + val stream = 
uri.readURL[IO](8192, blocker) + val dim = ImageSize.get(stream).unsafeRunSync() + assertEquals(dim, expect.some) + } + } +} diff --git a/modules/files/src/test/scala/docspell/files/Playing.scala b/modules/files/src/test/scala/docspell/files/Playing.scala new file mode 100644 index 00000000..ae87bd2d --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/Playing.scala @@ -0,0 +1,25 @@ +package docspell.files + +import cats.effect.{Blocker, ExitCode, IO, IOApp} +import docspell.common.MimeTypeHint + +import scala.concurrent.ExecutionContext + +object Playing extends IOApp { + val blocker = Blocker.liftExecutionContext(ExecutionContext.global) + + + def run(args: List[String]): IO[ExitCode] = IO { + //val ods = ExampleFiles.examples_sample_ods.readURL[IO](8192, blocker) + //val odt = ExampleFiles.examples_sample_odt.readURL[IO](8192, blocker) + val rtf = ExampleFiles.examples_sample_rtf.readURL[IO](8192, blocker) + + val x = for { + odsm1 <- TikaMimetype.detect(rtf, + MimeTypeHint.filename(ExampleFiles.examples_sample_rtf.path.segments.last)) + odsm2 <- TikaMimetype.detect(rtf, MimeTypeHint.none) + } yield (odsm1, odsm2) + println(x.unsafeRunSync()) + ExitCode.Success + } +} diff --git a/modules/files/src/test/scala/docspell/files/TestFiles.scala b/modules/files/src/test/scala/docspell/files/TestFiles.scala new file mode 100644 index 00000000..1ee01c9a --- /dev/null +++ b/modules/files/src/test/scala/docspell/files/TestFiles.scala @@ -0,0 +1,29 @@ +package docspell.files + +import cats.effect.{Blocker, IO} +import fs2.Stream + +import scala.concurrent.ExecutionContext + +object TestFiles { + val blocker = Blocker.liftExecutionContext(ExecutionContext.global) + implicit val CS = IO.contextShift(ExecutionContext.global) + + val letterSourceDE: Stream[IO, Byte] = + ExampleFiles.letter_de_pdf + .readURL[IO](8 * 1024, blocker) + + val letterSourceEN: Stream[IO, Byte] = + ExampleFiles.letter_en_pdf + .readURL[IO](8 * 1024, blocker) + + lazy val letterDEText = + ExampleFiles.letter_de_txt + .readText[IO](8 * 1024, blocker) + .unsafeRunSync + + lazy val letterENText = + ExampleFiles.letter_en_txt + .readText[IO](8 * 1024, blocker) + .unsafeRunSync +} diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 79b68912..9712f54d 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -65,67 +65,168 @@ docspell.joex { } # Configuration of text extraction - # - # Extracting text currently only work for image and pdf files. It - # will first runs ghostscript to create a gray image from a - # pdf. Then unpaper is run to optimize the image for the upcoming - # ocr, which will be done by tesseract. All these programs must be - # available in your PATH or the absolute path can be specified - # below. extraction { - allowed-content-types = [ "application/pdf", "image/jpeg", "image/png" ] - - # Defines what pages to process. If a PDF with 600 pages is - # submitted, it is probably not necessary to scan through all of - # them. This would take a long time and occupy resources for no - # value. The first few pages should suffice. The default is first - # 10 pages. - # - # If you want all pages being processed, set this number to -1. - # - # Note: if you change the ghostscript command below, be aware that - # this setting (if not -1) will add another parameter to the - # beginning of the command. 
- page-range { - begin = 10 + # For PDF files it is first tried to read the text parts of the + # PDF. But PDFs can be complex documents and they may contain text + # and images. If the returned text is shorter than the value + # below, OCR is run afterwards. Then both extracted texts are + # compared and the longer will be used. + pdf { + min-text-len = 10 } - # The ghostscript command. - ghostscript { - command { - program = "gs" - args = [ "-dNOPAUSE" - , "-dBATCH" - , "-dSAFER" - , "-sDEVICE=tiffscaled8" - , "-sOutputFile={{outfile}}" - , "{{infile}}" - ] - timeout = "5 minutes" + # Extracting text using OCR works for image and pdf files. It will + # first run ghostscript to create a gray image from a pdf. Then + # unpaper is run to optimize the image for the upcoming ocr, which + # will be done by tesseract. All these programs must be available + # in your PATH or the absolute path can be specified below. + ocr { + + # Images greater than this size are skipped. Note that every + # image is loaded completely into memory for doing OCR. + max-image-size = 14000000 + + # Defines what pages to process. If a PDF with 600 pages is + # submitted, it is probably not necessary to scan through all of + # them. This would take a long time and occupy resources for no + # value. The first few pages should suffice. The default is first + # 10 pages. + # + # If you want all pages being processed, set this number to -1. + # + # Note: if you change the ghostscript command below, be aware that + # this setting (if not -1) will add another parameter to the + # beginning of the command. + page-range { + begin = 10 } - working-dir = ${java.io.tmpdir}"/docspell-extraction" - } - # The unpaper command. - unpaper { - command { - program = "unpaper" - args = [ "{{infile}}", "{{outfile}}" ] - timeout = "5 minutes" + # The ghostscript command. + ghostscript { + command { + program = "gs" + args = [ "-dNOPAUSE" + , "-dBATCH" + , "-dSAFER" + , "-sDEVICE=tiffscaled8" + , "-sOutputFile={{outfile}}" + , "{{infile}}" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-extraction" } - } - # The tesseract command. - tesseract { - command { - program = "tesseract" - args = ["{{file}}" - , "stdout" - , "-l" - , "{{lang}}" - ] - timeout = "5 minutes" + # The unpaper command. + unpaper { + command { + program = "unpaper" + args = [ "{{infile}}", "{{outfile}}" ] + timeout = "5 minutes" + } + } + + # The tesseract command. + tesseract { + command { + program = "tesseract" + args = ["{{file}}" + , "stdout" + , "-l" + , "{{lang}}" + ] + timeout = "5 minutes" + } } } } + + # Configuration for converting files into PDFs. + # + # Most of it is delegated to external tools, which can be configured + # below. They must be in the PATH environment or specify the full + # path below via the `program` key. + convert { + + # The chunk size used when storing files. This should be the same + # as used with the rest server. + chunk-size = 524288 + + # When reading images, this is the maximum size. Images that are + # larger are not processed. + max-image-size = ${docspell.joex.extraction.ocr.max-image-size} + + # Settings when processing markdown files (and other text files) + # to HTML. + # + # In order to support text formats, text files are first converted + # to HTML using a markdown processor. The resulting HTML is then + # converted to a PDF file. + markdown { + + # The CSS that is used to style the resulting HTML. 
+ internal-css = """ + body { padding: 2em 5em; } + """ + } + + # To convert HTML files into PDF files, the external tool + # wkhtmltopdf is used. + wkhtmlpdf { + command = { + program = "wkhtmltopdf" + args = [ + "-s", + "A4", + "--encoding", + "UTF-8", + "-", + "{{outfile}}" + ] + timeout = "2 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + # To convert image files to PDF files, tesseract is used. This + # also extracts the text in one go. + tesseract = { + command = { + program = "tesseract" + args = [ + "{{infile}}", + "out", + "-l", + "{{lang}}", + "pdf", + "txt" + ] + timeout = "5 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + + # To convert "office" files to PDF files, the external tool + # unoconv is used. Unoconv uses libreoffice/openoffice for + # converting. So it supports all formats that are possible to read + # with libreoffice/openoffic. + # + # Note: to greatly improve performance, it is recommended to start + # a libreoffice listener by running `unoconv -l` in a separate + # process. + unoconv = { + command = { + program = "unoconv" + args = [ + "-f", + "pdf", + "-o", + "{{outfile}}", + "{{infile}}" + ] + timeout = "2 minutes" + } + working-dir = ${java.io.tmpdir}"/docspell-convert" + } + } } \ No newline at end of file diff --git a/modules/joex/src/main/scala/docspell/joex/Config.scala b/modules/joex/src/main/scala/docspell/joex/Config.scala index b8f6b7ff..62ad3aad 100644 --- a/modules/joex/src/main/scala/docspell/joex/Config.scala +++ b/modules/joex/src/main/scala/docspell/joex/Config.scala @@ -3,25 +3,19 @@ package docspell.joex import docspell.common.{Ident, LenientUri} import docspell.joex.scheduler.SchedulerConfig import docspell.store.JdbcConfig -import docspell.text.ocr.{Config => OcrConfig} +import docspell.convert.ConvertConfig +import docspell.extract.ExtractConfig case class Config( - appId: Ident, - baseUrl: LenientUri, - bind: Config.Bind, - jdbc: JdbcConfig, - scheduler: SchedulerConfig, - extraction: OcrConfig + appId: Ident, + baseUrl: LenientUri, + bind: Config.Bind, + jdbc: JdbcConfig, + scheduler: SchedulerConfig, + extraction: ExtractConfig, + convert: ConvertConfig ) object Config { - val postgres = - JdbcConfig(LenientUri.unsafe("jdbc:postgresql://localhost:5432/docspelldev"), "dev", "dev") - val h2 = JdbcConfig( - LenientUri.unsafe("jdbc:h2:./target/docspelldev.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE"), - "sa", - "" - ) - case class Bind(address: String, port: Int) } diff --git a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala index 374cf396..34270987 100644 --- a/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala +++ b/modules/joex/src/main/scala/docspell/joex/JoexAppImpl.scala @@ -55,7 +55,7 @@ object JoexAppImpl { .withTask( JobTask.json( ProcessItemArgs.taskName, - ItemHandler[F](cfg.extraction), + ItemHandler[F](cfg), ItemHandler.onCancel[F] ) ) diff --git a/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala new file mode 100644 index 00000000..a1035da1 --- /dev/null +++ b/modules/joex/src/main/scala/docspell/joex/process/ConvertPdf.scala @@ -0,0 +1,125 @@ +package docspell.joex.process + +import bitpeace.{Mimetype, MimetypeHint, RangeDef} +import cats.implicits._ +import cats.Functor +import cats.implicits._ +import cats.effect._ +import cats.data.{Kleisli, OptionT} +import fs2.Stream +import docspell.common._ +import 
docspell.convert._
+import docspell.joex.scheduler._
+import docspell.store.records._
+import docspell.convert.ConversionResult.Handler
+
+/** Goes through all attachments and creates a PDF version of each
+  * where this is supported.
+  *
+  * The `attachment` record is updated with the PDF version while the
+  * original file is stored in the `attachment_source` record.
+  *
+  * If pdf conversion is not possible or if the input is already a
+  * pdf, both files are identical. That is, the `file_id`s point to
+  * the same file. Since the name of an attachment may be changed by
+  * the user, the original name is kept in the `attachment_source`
+  * record, too.
+  *
+  * This step assumes an existing, not yet fully processed item; it
+  * traverses its attachments.
+  */
+object ConvertPdf {
+
+  def apply[F[_]: Sync: ContextShift](
+      cfg: ConvertConfig,
+      item: ItemData
+  ): Task[F, ProcessItemArgs, ItemData] =
+    Task { ctx =>
+      def convert(ra: RAttachment) =
+        findMime(ctx)(ra).flatMap(m => convertSafe(cfg, ctx, item)(ra, m))
+
+      for {
+        ras <- item.attachments.traverse(convert)
+        nra = ras.map(_._1)
+        nma = ras.flatMap(_._2)
+      } yield item.copy(attachments = nra, metas = nma)
+
+    }
+
+  def findMime[F[_]: Functor](ctx: Context[F, _])(ra: RAttachment): F[Mimetype] =
+    OptionT(ctx.store.transact(RFileMeta.findById(ra.fileId)))
+      .map(_.mimetype)
+      .getOrElse(Mimetype.`application/octet-stream`)
+
+  def convertSafe[F[_]: Sync: ContextShift](
+      cfg: ConvertConfig,
+      ctx: Context[F, ProcessItemArgs],
+      item: ItemData
+  )(ra: RAttachment, mime: Mimetype): F[(RAttachment, Option[RAttachmentMeta])] =
+    Conversion.create[F](cfg, ctx.blocker, ctx.logger).use { conv =>
+      mime match {
+        case Mimetype.`application/pdf` =>
+          ctx.logger.info("Not going to convert a PDF file into a PDF.") *>
+            (ra, None: Option[RAttachmentMeta]).pure[F]
+
+        case _ =>
+          val data = ctx.store.bitpeace
+            .get(ra.fileId.id)
+            .unNoneTerminate
+            .through(ctx.store.bitpeace.fetchData2(RangeDef.all))
+          val handler = conversionHandler[F](ctx, cfg, ra, item)
+          ctx.logger.info(s"Converting file ${ra.name} (${mime.asString}) into a PDF") *>
+            conv.toPDF(DataType(MimeType(mime.primary, mime.sub)), ctx.args.meta.language, handler)(data)
+      }
+    }
+
+  private def conversionHandler[F[_]: Sync](
+      ctx: Context[F, ProcessItemArgs],
+      cfg: ConvertConfig,
+      ra: RAttachment,
+      item: ItemData
+  ): Handler[F, (RAttachment, Option[RAttachmentMeta])] =
+    Kleisli({
+      case ConversionResult.SuccessPdf(pdf) =>
+        ctx.logger.info(s"Conversion to pdf successful. Saving file.") *>
+          storePDF(ctx, cfg, ra, pdf)
+            .map(r => (r, None))
+
+      case ConversionResult.SuccessPdfTxt(pdf, txt) =>
+        ctx.logger.info(s"Conversion to pdf+txt successful. Saving file.") *>
+          storePDF(ctx, cfg, ra, pdf)
+            .flatMap(r =>
+              txt.map(t => (r, item.changeMeta(ra.id, _.setContentIfEmpty(t.some)).some))
+            )
+
+      case ConversionResult.UnsupportedFormat(mt) =>
+        ctx.logger.info(s"PDF conversion for type ${mt.asString} not supported!") *>
+          (ra, None: Option[RAttachmentMeta]).pure[F]
+
+      case ConversionResult.InputMalformed(mt, reason) =>
+        ctx.logger.info(
+          s"PDF conversion from type ${mt.asString} reported malformed input: $reason."
+        ) *>
+          (ra, None: Option[RAttachmentMeta]).pure[F]
+
+      case ConversionResult.Failure(ex) =>
+        ctx.logger.error(s"PDF conversion failed: ${ex.getMessage}. 
Go without PDF file") *> + (ra, None: Option[RAttachmentMeta]).pure[F] + }) + + private def storePDF[F[_]: Sync]( + ctx: Context[F, ProcessItemArgs], + cfg: ConvertConfig, + ra: RAttachment, + pdf: Stream[F, Byte] + ) = { + val hint = MimeTypeHint.advertised(MimeType.pdf).withName(ra.name.getOrElse("file.pdf")) + val newName = ra.name.map(n => s"$n.pdf") + ctx.store.bitpeace + .saveNew(pdf, cfg.chunkSize, MimetypeHint(hint.filename, hint.advertised)) + .compile + .lastOrError + .map(fm => Ident.unsafe(fm.id)) + .flatMap(fmId => ctx.store.transact(RAttachment.updateFileIdAndName(ra.id, fmId, newName)).map(_ => fmId)) + .map(fmId => ra.copy(fileId = fmId, name = newName)) + } +} diff --git a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala index 916974d1..6eada36d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/CreateItem.scala @@ -1,12 +1,14 @@ package docspell.joex.process +import bitpeace.FileMeta import cats.implicits._ import cats.effect.Sync +import cats.data.OptionT import fs2.Stream import docspell.common._ import docspell.joex.scheduler.{Context, Task} import docspell.store.queries.QItem -import docspell.store.records.{RAttachment, RItem} +import docspell.store.records.{RAttachment, RAttachmentSource, RItem} /** * Task that creates the item. @@ -21,13 +23,15 @@ object CreateItem { def createNew[F[_]: Sync]: Task[F, ProcessItemArgs, ItemData] = Task { ctx => - val validFiles = ctx.args.meta.validFileTypes.map(_.asString).toSet + def isValidFile(fm: FileMeta) = + ctx.args.meta.validFileTypes.isEmpty || + ctx.args.meta.validFileTypes.map(_.asString).toSet.contains(fm.mimetype.baseType) def fileMetas(itemId: Ident, now: Timestamp) = Stream .emits(ctx.args.files) .flatMap(f => ctx.store.bitpeace.get(f.fileMetaId.id).map(fm => (f, fm))) - .collect({ case (f, Some(fm)) if validFiles.contains(fm.mimetype.baseType) => f }) + .collect({ case (f, Some(fm)) if isValidFile(fm) => f }) .zipWithIndex .evalMap({ case (f, index) => @@ -53,13 +57,21 @@ object CreateItem { n <- ctx.store.transact(RItem.insert(it)) _ <- if (n != 1) storeItemError[F](ctx) else ().pure[F] fm <- fileMetas(it.id, it.created) - k <- fm.traverse(a => ctx.store.transact(RAttachment.insert(a))) + k <- fm.traverse(insertAttachment(ctx)) _ <- logDifferences(ctx, fm, k.sum) dur <- time _ <- ctx.logger.info(s"Creating item finished in ${dur.formatExact}") - } yield ItemData(it, fm, Vector.empty, Vector.empty) + } yield ItemData(it, fm, Vector.empty, Vector.empty, fm.map(a => a.id -> a.fileId).toMap) } + def insertAttachment[F[_]: Sync](ctx: Context[F, ProcessItemArgs])(ra: RAttachment): F[Int] = { + val rs = RAttachmentSource.of(ra) + ctx.store.transact(for { + n <- RAttachment.insert(ra) + _ <- RAttachmentSource.insert(rs) + } yield n) + } + def findExisting[F[_]: Sync]: Task[F, ProcessItemArgs, Option[ItemData]] = Task { ctx => for { @@ -69,12 +81,18 @@ object CreateItem { ht <- cand.drop(1).traverse(ri => QItem.delete(ctx.store)(ri.id, ri.cid)) _ <- if (ht.sum > 0) ctx.logger.warn(s"Removed ${ht.sum} items with same attachments") else ().pure[F] - rms <- cand.headOption.traverse(ri => - ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) - ) - } yield cand.headOption.map(ri => - ItemData(ri, rms.getOrElse(Vector.empty), Vector.empty, Vector.empty) - ) + rms <- OptionT( + cand.headOption.traverse(ri => + 
ctx.store.transact(RAttachment.findByItemAndCollective(ri.id, ri.cid)) + ) + ).getOrElse(Vector.empty) + orig <- rms.traverse(a => + ctx.store.transact(RAttachmentSource.findById(a.id)).map(s => (a, s)) + ) + origMap = orig + .map(originFileTuple) + .toMap + } yield cand.headOption.map(ri => ItemData(ri, rms, Vector.empty, Vector.empty, origMap)) } private def logDifferences[F[_]: Sync]( @@ -94,4 +112,8 @@ object CreateItem { val msg = "Inserting item failed. DB returned 0 update count!" ctx.logger.error(msg) *> Sync[F].raiseError(new Exception(msg)) } + + //TODO if no source is present, it must be saved! + private def originFileTuple(t: (RAttachment, Option[RAttachmentSource])): (Ident, Ident) = + t._2.map(s => s.id -> s.fileId).getOrElse(t._1.id -> t._1.fileId) } diff --git a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala index c8b57365..c9aff410 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/FindProposal.scala @@ -5,11 +5,11 @@ import java.time.ZoneId import cats.{Applicative, FlatMap} import cats.implicits._ import cats.effect.Sync +import docspell.analysis.contact._ import docspell.common.MetaProposal.Candidate import docspell.common._ import docspell.joex.scheduler.{Context, Task} -import docspell.store.records.{RAttachmentMeta, REquipment, ROrganization, RPerson} -import docspell.text.contact.Domain +import docspell.store.records._ /** Super simple approach to find corresponding meta data to an item * by looking up values from NER in the users address book. diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala index a6f751f7..b9fd22c4 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemData.scala @@ -8,7 +8,8 @@ case class ItemData( item: RItem, attachments: Vector[RAttachment], metas: Vector[RAttachmentMeta], - dateLabels: Vector[AttachmentDates] + dateLabels: Vector[AttachmentDates], + originFile: Map[Ident, Ident] //maps RAttachment.id -> FileMeta.id ) { def findMeta(attachId: Ident): Option[RAttachmentMeta] = @@ -16,6 +17,21 @@ case class ItemData( def findDates(rm: RAttachmentMeta): Vector[NerDateLabel] = dateLabels.find(m => m.rm.id == rm.id).map(_.dates).getOrElse(Vector.empty) + + def mapMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): ItemData = { + val item = changeMeta(attachId, f) + val next = metas.map(a => if (a.id == attachId) item else a) + copy(metas = next) + } + + def changeMeta(attachId: Ident, f: RAttachmentMeta => RAttachmentMeta): RAttachmentMeta = + f(findOrCreate(attachId)) + + def findOrCreate(attachId: Ident): RAttachmentMeta = + metas.find(_.id == attachId).getOrElse { + RAttachmentMeta.empty(attachId) + } + } object ItemData { diff --git a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala index 0d7dda6d..66104e96 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ItemHandler.scala @@ -3,16 +3,16 @@ package docspell.joex.process import cats.implicits._ import cats.effect.{ContextShift, Sync} import docspell.common.{ItemState, ProcessItemArgs} +import docspell.joex.Config import docspell.joex.scheduler.{Context, Task} 
import docspell.store.queries.QItem import docspell.store.records.{RItem, RJob} -import docspell.text.ocr.{Config => OcrConfig} object ItemHandler { def onCancel[F[_]: Sync: ContextShift]: Task[F, ProcessItemArgs, Unit] = logWarn("Now cancelling. Deleting potentially created data.").flatMap(_ => deleteByFileIds) - def apply[F[_]: Sync: ContextShift](cfg: OcrConfig): Task[F, ProcessItemArgs, Unit] = + def apply[F[_]: Sync: ContextShift](cfg: Config): Task[F, ProcessItemArgs, Unit] = CreateItem[F] .flatMap(itemStateTask(ItemState.Processing)) .flatMap(safeProcess[F](cfg)) @@ -30,7 +30,7 @@ object ItemHandler { } yield last def safeProcess[F[_]: Sync: ContextShift]( - cfg: OcrConfig + cfg: Config )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = Task(isLastRetry[F, ProcessItemArgs] _).flatMap { case true => diff --git a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala index 88d16892..bb67fe03 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/ProcessItem.scala @@ -3,14 +3,15 @@ package docspell.joex.process import cats.effect.{ContextShift, Sync} import docspell.common.ProcessItemArgs import docspell.joex.scheduler.Task -import docspell.text.ocr.{Config => OcrConfig} +import docspell.joex.Config object ProcessItem { def apply[F[_]: Sync: ContextShift]( - cfg: OcrConfig + cfg: Config )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = - TextExtraction(cfg, item) + ConvertPdf(cfg.convert, item) + .flatMap(TextExtraction(cfg.extraction, _)) .flatMap(Task.setProgress(25)) .flatMap(TextAnalysis[F]) .flatMap(Task.setProgress(50)) diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala index 7e4cc13b..ddc3f0c8 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextAnalysis.scala @@ -2,13 +2,13 @@ package docspell.joex.process import cats.implicits._ import cats.effect.Sync -import docspell.common.{Duration, Language, NerLabel, ProcessItemArgs} +import docspell.analysis.nlp._ +import docspell.analysis.contact._ +import docspell.analysis.date._ +import docspell.common._ import docspell.joex.process.ItemData.AttachmentDates import docspell.joex.scheduler.Task import docspell.store.records.RAttachmentMeta -import docspell.text.contact.Contact -import docspell.text.date.DateFind -import docspell.text.nlp.StanfordNerClassifier object TextAnalysis { diff --git a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala index 157fdfee..6f72836d 100644 --- a/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala +++ b/modules/joex/src/main/scala/docspell/joex/process/TextExtraction.scala @@ -1,25 +1,25 @@ package docspell.joex.process -import bitpeace.RangeDef +import bitpeace.{Mimetype, RangeDef} +import cats.data.OptionT import cats.implicits._ -import cats.effect.{Blocker, ContextShift, Sync} -import docspell.common.{Duration, Language, ProcessItemArgs} +import cats.effect.{ContextShift, Sync} +import docspell.common._ +import docspell.extract.{ExtractConfig, ExtractResult, Extraction} import docspell.joex.scheduler.{Context, Task} -import docspell.store.Store -import docspell.store.records.{RAttachment, RAttachmentMeta} -import 
docspell.text.ocr.{TextExtract, Config => OcrConfig} +import docspell.store.records.{RAttachment, RAttachmentMeta, RFileMeta} object TextExtraction { def apply[F[_]: Sync: ContextShift]( - cfg: OcrConfig, - item: ItemData + cfg: ExtractConfig, + item: ItemData ): Task[F, ProcessItemArgs, ItemData] = Task { ctx => for { _ <- ctx.logger.info("Starting text extraction") start <- Duration.stopTime[F] - txt <- item.attachments.traverse(extractTextToMeta(ctx, cfg, ctx.args.meta.language)) + txt <- item.attachments.traverse(extractTextIfEmpty(ctx, cfg, ctx.args.meta.language, item)) _ <- ctx.logger.debug("Storing extracted texts") _ <- txt.toList.traverse(rm => ctx.store.transact(RAttachmentMeta.upsert(rm))) dur <- start @@ -27,33 +27,99 @@ object TextExtraction { } yield item.copy(metas = txt) } + def extractTextIfEmpty[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + cfg: ExtractConfig, + lang: Language, + item: ItemData + )(ra: RAttachment): F[RAttachmentMeta] = { + val rm = item.findOrCreate(ra.id) + rm.content match { + case Some(_) => + ctx.logger.info("TextExtraction skipped, since text is already available.") *> + rm.pure[F] + case None => + extractTextToMeta[F](ctx, cfg, lang, item)(ra) + } + } + def extractTextToMeta[F[_]: Sync: ContextShift]( ctx: Context[F, _], - cfg: OcrConfig, - lang: Language + cfg: ExtractConfig, + lang: Language, + item: ItemData )(ra: RAttachment): F[RAttachmentMeta] = for { - _ <- ctx.logger.debug(s"Extracting text for attachment ${ra.name}") + _ <- ctx.logger.debug(s"Extracting text for attachment ${stripAttachmentName(ra)}") dst <- Duration.stopTime[F] - txt <- extractText(cfg, lang, ctx.store, ctx.blocker)(ra) - meta = RAttachmentMeta.empty(ra.id).copy(content = txt.map(_.trim).filter(_.nonEmpty)) + txt <- extractTextFallback(ctx, cfg, ra, lang)(filesToExtract(item, ra)) + meta = item.changeMeta(ra.id, rm => rm.setContentIfEmpty(txt.map(_.trim).filter(_.nonEmpty))) est <- dst _ <- ctx.logger.debug( - s"Extracting text for attachment ${ra.name} finished in ${est.formatExact}" + s"Extracting text for attachment ${stripAttachmentName(ra)} finished in ${est.formatExact}" ) } yield meta def extractText[F[_]: Sync: ContextShift]( - ocrConfig: OcrConfig, - lang: Language, - store: Store[F], - blocker: Blocker - )(ra: RAttachment): F[Option[String]] = { - val data = store.bitpeace - .get(ra.fileId.id) + ctx: Context[F, _], + extr: Extraction[F], + lang: Language + )(fileId: Ident): F[ExtractResult] = { + val data = ctx.store.bitpeace + .get(fileId.id) .unNoneTerminate - .through(store.bitpeace.fetchData2(RangeDef.all)) + .through(ctx.store.bitpeace.fetchData2(RangeDef.all)) - TextExtract.extract(data, blocker, lang.iso3, ocrConfig).compile.last + def findMime: F[Mimetype] = + OptionT(ctx.store.transact(RFileMeta.findById(fileId))) + .map(_.mimetype) + .getOrElse(Mimetype.`application/octet-stream`) + + findMime + .flatMap(mt => + extr.extractText(data, DataType(MimeType(mt.primary, mt.sub)), lang)) } + + private def extractTextFallback[F[_]: Sync: ContextShift]( + ctx: Context[F, _], + cfg: ExtractConfig, + ra: RAttachment, + lang: Language, + )(fileIds: List[Ident]): F[Option[String]] = { + fileIds match { + case Nil => + ctx.logger.error(s"Cannot extract text").map(_ => None) + + case id :: rest => + val extr = Extraction.create[F](ctx.blocker, ctx.logger, cfg) + + extractText[F](ctx, extr, lang)(id) + .flatMap({ + case ExtractResult.Success(txt) => + txt.some.pure[F] + + case ExtractResult.UnsupportedFormat(mt) => + ctx.logger.warn(s"Cannot extract text from 
file ${stripAttachmentName(ra)}: unsupported format ${mt.asString}. Try with converted file."). + flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) + + case ExtractResult.Failure(ex) => + ctx.logger.warn(s"Cannot extract text: ${ex.getMessage}. Try with converted file"). + flatMap(_ => extractTextFallback[F](ctx, cfg, ra, lang)(rest)) + }) + } + } + + /** Returns the fileIds to extract text from. First, the source file + * is tried. If that fails, the converted file is tried. + */ + private def filesToExtract(item: ItemData, ra: RAttachment): List[Ident] = + item.originFile.get(ra.id) match { + case Some(sid) => List(sid, ra.fileId).distinct + case None => List(ra.fileId) + } + + private def stripAttachmentName(ra: RAttachment): String = + ra.name + .map(s => if (s.endsWith(".pdf") && s.count(_ == '.') > 1) s.dropRight(4) else s) + .getOrElse("") } diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala index 7ce0b04a..ba1784a0 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/Context.scala @@ -3,7 +3,7 @@ package docspell.joex.scheduler import cats.Functor import cats.effect.{Blocker, Concurrent} import cats.implicits._ -import docspell.common.Ident +import docspell.common._ import docspell.store.Store import docspell.store.records.RJob import docspell.common.syntax.all._ @@ -52,7 +52,7 @@ object Context { ): F[Context[F, A]] = for { _ <- log.ftrace("Creating logger for task run") - logger <- Logger(job.id, job.info, config.logBufferSize, logSink) + logger <- QueueLogger(job.id, job.info, config.logBufferSize, logSink) _ <- log.ftrace("Logger created, instantiating context") ctx = create[F, A](job, arg, config, logger, store, blocker) } yield ctx diff --git a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala b/modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala similarity index 84% rename from modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala rename to modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala index 353c4182..86f2e36e 100644 --- a/modules/joex/src/main/scala/docspell/joex/scheduler/Logger.scala +++ b/modules/joex/src/main/scala/docspell/joex/scheduler/QueueLogger.scala @@ -5,18 +5,7 @@ import cats.effect.{Concurrent, Sync} import docspell.common._ import fs2.concurrent.Queue -trait Logger[F[_]] { - - def trace(msg: => String): F[Unit] - def debug(msg: => String): F[Unit] - def info(msg: => String): F[Unit] - def warn(msg: => String): F[Unit] - def error(ex: Throwable)(msg: => String): F[Unit] - def error(msg: => String): F[Unit] - -} - -object Logger { +object QueueLogger { def create[F[_]: Sync](jobId: Ident, jobInfo: String, q: Queue[F, LogEvent]): Logger[F] = new Logger[F] { diff --git a/modules/microsite/docs/dev/adr.md b/modules/microsite/docs/dev/adr.md index 22481e3f..285571da 100644 --- a/modules/microsite/docs/dev/adr.md +++ b/modules/microsite/docs/dev/adr.md @@ -5,8 +5,14 @@ title: ADRs # ADR -- [0001 Components](adr/0001_components.html) -- [0002 Component Interaction](adr/0002_component_interaction.html) -- [0003 Encryption](adr/0003_encryption.html) -- [0004 ISO8601 vs Unix](adr/0004_iso8601vsEpoch.html) -- [0005 Job Executor](adr/0005_job-executor.html) +- [0001 Components](adr/0001_components) +- [0002 Component Interaction](adr/0002_component_interaction) +- [0003 Encryption](adr/0003_encryption) +- [0004 
ISO8601 vs Unix](adr/0004_iso8601vsEpoch)
+- [0005 Job Executor](adr/0005_job-executor)
+- [0006 More File Types](adr/0006_more-file-types)
+  - [0007 Convert HTML](adr/0007_convert_html_files)
+  - [0008 Convert Text](adr/0008_convert_plain_text)
+  - [0009 Convert Office Files](adr/0009_convert_office_docs)
+  - [0010 Convert Image Files](adr/0010_convert_image_files)
+  - [0011 Extract Text](adr/0011_extract_text)
diff --git a/modules/microsite/docs/dev/adr/0000_use_markdown_architectural_decision_records.md b/modules/microsite/docs/dev/adr/0000_use_markdown_architectural_decision_records.md
index 67186bbc..cc75f776 100644
--- a/modules/microsite/docs/dev/adr/0000_use_markdown_architectural_decision_records.md
+++ b/modules/microsite/docs/dev/adr/0000_use_markdown_architectural_decision_records.md
@@ -1,3 +1,8 @@
+---
+layout: docs
+title: Use Markdown Architectural Decision Records
+---
+
 # Use Markdown Architectural Decision Records
 
 ## Context and Problem Statement
diff --git a/modules/microsite/docs/dev/adr/0006_more-file-types.md b/modules/microsite/docs/dev/adr/0006_more-file-types.md
new file mode 100644
index 00000000..08a7104b
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0006_more-file-types.md
@@ -0,0 +1,154 @@
+---
+layout: docs
+title: More File Types
+---
+
+# More File Types
+
+## Context and Problem Statement
+
+Docspell currently only supports PDF files. This has simplified early
+development and design a lot, which helped with starting the project.
+Handling PDF files is usually easy (viewing, extracting text,
+printing, etc.).
+
+The PDF format has been chosen because PDF files are very common and
+can be viewed with many tools on many systems (i.e. non-proprietary
+tools). Docspell is also a document archive, and from this perspective
+it is important that documents can be viewed in 10 years and more. The
+hope is that the PDF format is best suited for this. Therefore all
+documents in Docspell must be accessible as PDF. The trivial solution
+to this requirement is to only allow PDF files.
+
+Support for more document types must then take care of the following:
+
+- extracting text
+- converting into PDF
+- accessing the original file
+
+Text should be extracted from the source file, in case conversion is
+not lossless. Since Docspell can already extract text from PDF files
+using OCR, text can also be extracted from the converted file as a
+fallback.
+
+The original file must always be accessible. The main reason is that
+all uploaded data should be accessible without any modification. And
+since the conversion may not always create the best results, the
+original file should be kept.
+
+
+## Decision Drivers
+
+People expect that software like Docspell supports the most common
+document types, like all the “office documents” (`docx`, `rtf`, `odt`,
+`xlsx`, …) and images. For many people it is more common to create
+those files instead of PDF. Some (older) scanners may not be able to
+scan into PDF files but only to image files.
+
+
+## Considered Options
+
+This ADR does not evaluate different options. It rather documents why
+this feature is realized and the thoughts that led to how it is
+implemented.
+
+## Realization
+
+### Data Model
+
+The `attachment` table holds one file. There will be another table
+`attachment_source` that holds the original file. It looks like this:
+
+``` sql
+CREATE TABLE "attachment_source" (
+  "id" varchar(254) not null primary key,
+  "file_id" varchar(254) not null,
+  "filename" varchar(254),
+  "created" timestamp not null,
+  foreign key ("file_id") references "filemeta"("id"),
+  foreign key ("id") references "attachment"("attachid")
+);
+```
+
+The `id` is the primary key and is the same as the associated
+`attachment`, creating a `1-1` relationship (well, more correctly a
+`0..1-1`) between `attachment` and `attachment_source`.
+
+There will always be an `attachment_source` record for every
+`attachment` record. If the original file is a PDF already, then both
+tables' `file_id` columns point to the same file. But now the user can
+change the filename of an `attachment` while the original filename is
+preserved in `attachment_source`. It must not be possible for the user
+to change anything in `attachment_source`.
+
+The `attachment` table is not touched in order to keep current code
+mostly unchanged and to have a simpler data migration. The downside is
+that the data model allows an `attachment` record to exist without an
+`attachment_source` record. OTOH, a foreign key inside `attachment`
+pointing to an `attachment_source` is also not correct, because it
+allows the same `attachment_source` record to be associated with many
+`attachment` records. This would do even more harm, in my opinion.
+
+### Migration
+
+Creating a new table and not altering existing ones should simplify
+data migration.
+
+Since only PDF files were allowed and the user could not change
+anything in the `attachment` table, the existing data can simply be
+inserted into the new table. This presents the trivial case where the
+attachment and source are the same.
+
+
+### Processing
+
+The first step in processing is now converting the file into a PDF.
+If it already is a PDF, nothing is done. This step happens before text
+extraction, so extracting text can first be tried on the source file,
+and only if that fails (or is not supported) is text extracted from
+the converted PDF file. All remaining steps are untouched.
+
+If conversion is not supported for the input file, it is skipped. If
+conversion fails, the error is propagated to let the retry mechanism
+take care of it.
+
+#### What types?
+
+Which file types should be supported? As a first step, all major
+office documents, common images, plain text (e.g. markdown) and HTML
+should be supported. In terms of file extensions: `doc`, `docx`,
+`xls`, `xlsx`, `odt`, `md`, `html`, `txt`, `jpg`, `png`, `tif`.
+
+There is always the preference to use JVM-internal libraries in order
+to be more platform independent and to reduce external dependencies.
+But this is not always possible (like doing OCR).
+
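+To make the processing order above concrete, here is a small,
+self-contained sketch (a toy model for illustration only, not the
+actual docspell code; all names are made up). See also the diagram
+below.
+
+``` scala
+// Toy model: conversion runs first; text extraction prefers the
+// source file and falls back to the converted PDF.
+def convertToPdf(file: String): String =
+  if (file.endsWith(".pdf")) file else file + ".pdf"
+
+def extractText(file: String): Option[String] =
+  if (file.endsWith(".md") || file.endsWith(".pdf")) Some(s"text of $file")
+  else None
+
+def process(source: String): (String, Option[String]) = {
+  val pdf = convertToPdf(source) // a no-op for PDF input
+  (pdf, extractText(source).orElse(extractText(pdf)))
+}
+```
+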
+![Processing files](img/process-files.png)
+
+#### Conversion
+
+- Office documents (`doc`, `docx`, `xls`, `xlsx`, `odt`, `ods`):
+  unoconv (see [ADR 9](0009_convert_office_docs))
+- HTML (`html`): wkhtmltopdf (see [ADR 7](0007_convert_html_files))
+- Text/Markdown (`txt`, `md`): Java library flexmark + wkhtmltopdf
+- Images (`jpg`, `png`, `tif`): Tesseract (see [ADR
+  10](0010_convert_image_files))
+
+#### Text Extraction
+
+- Office documents (`doc`, `docx`, `xls`, `xlsx`): Apache POI
+- Office documents (`odt`, `ods`): Apache Tika (including the sources)
+- HTML: not supported, extract text from converted PDF
+- Images (`jpg`, `png`, `tif`): Tesseract
+- Text/Markdown: n.a.
+- PDF: Apache PDFBox or Tesseract
+
+## Links
+
+* [Convert HTML Files](0007_convert_html_files)
+* [Convert Plain Text](0008_convert_plain_text)
+* [Convert Office Documents](0009_convert_office_docs)
+* [Convert Image Files](0010_convert_image_files)
+* [Extract Text from Files](0011_extract_text)
diff --git a/modules/microsite/docs/dev/adr/0007_convert_html_files.md b/modules/microsite/docs/dev/adr/0007_convert_html_files.md
new file mode 100644
index 00000000..ddc12234
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0007_convert_html_files.md
@@ -0,0 +1,71 @@
+---
+layout: docs
+title: Convert HTML Files
+---
+
+# {{ page.title }}
+
+## Context and Problem Statement
+
+How can HTML documents be converted into a PDF file that looks as much
+as possible like the original?
+
+It would be nice to have a Java-only solution. But if an external tool
+has a better outcome, then an external tool is fine, too.
+
+Since Docspell is free software, the tools must also be free.
+
+
+## Considered Options
+
+* [pandoc](https://pandoc.org/) external command
+* [wkhtmltopdf](https://wkhtmltopdf.org/) external command
+* [Unoconv](https://github.com/unoconv/unoconv) external command
+
+Native (firefox) view:
+
+![Native view (firefox)](img/example-html-native.jpg)
+ +Note: the example html is from +[here](https://www.sparksuite.com/open-source/invoice.html). + +I downloaded the HTML file to disk together with its resources (using +*Save as...* in the browser). + + +### Pandoc + +
+![pandoc, latex engine](img/example-html-pandoc-latex.jpg)
+ +
+![pandoc, html engine](img/example-html-pandoc-html.jpg)
+
+Not showing the version using the `context` pdf-engine, since it
+looked very similar to the latex variant.
+
+
+### wkhtmltopdf
+
+![wkhtmltopdf](img/example-html-wkhtmltopdf.jpg)
+ + +### Unoconv + + +
+![unoconv](img/example-html-unoconv.jpg)
+
+
+## Decision Outcome
+
+wkhtmltopdf.
+
+It shows the best results.
diff --git a/modules/microsite/docs/dev/adr/0008_convert_plain_text.md b/modules/microsite/docs/dev/adr/0008_convert_plain_text.md
new file mode 100644
index 00000000..0d4a7e75
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0008_convert_plain_text.md
@@ -0,0 +1,191 @@
+---
+layout: docs
+title: Convert Text Files
+---
+
+# {{ page.title }}
+
+## Context and Problem Statement
+
+How can plain text and markdown documents be converted into PDF
+files?
+
+Rendering images is not important here, since the files must be
+self-contained when uploaded to Docspell.
+
+The test file is the current documentation page of Docspell, found in
+`microsite/docs/doc.md`.
+
+```
+---
+layout: docs
+position: 4
+title: Documentation
+---
+
+# {page .title}
+
+
+Docspell assists in organizing large amounts of PDF files that are
+...
+
+## How it works
+
+Documents have two ...
+
+1. You maintain a kind of address book. It should list all possible
+   correspondents and the concerning people/things. This grows
+   incrementally with each new unknown document.
+2. When docspell analyzes a document, it tries to find matches within
+   your address ...
+3. You can inspect ...
+
+The set of meta data that docspell uses to draw suggestions from, must
+be maintained ...
+
+
+## Terms
+
+In order to better understand these pages, some terms should be
+explained first.
+
+### Item
+
+An **Item** is roughly your (pdf) document, only that an item may span
+multiple files, which are called **attachments**. And an item has
+**meta data** associated:
+
+- a **correspondent**: the other side of the communication. It can be
+  an organization or a person.
+- a **concerning person** or **equipment**: a person or thing that
+  this item is about. Maybe it is an insurance contract about your
+  car.
+- ...
+
+### Collective
+
+The users of the application are part of a **collective**. A
+**collective** is a group of users that share access to the same
+items. The account name is therefore comprised of a *collective name*
+and a *user name*.
+
+All users of a collective are equal; they have same permissions to
+access all...
+```
+
+Then a plain text file is tried, too (without any markup).
+
+```
+Maecenas mauris lectus, lobortis et purus mattis
+
+Duis vehicula mi vel mi pretium
+
+In non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu.
+
+Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut.
+Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros
+efficitur tincidunt. Cras justo mi, porttitor quis mattis vel,
+ultricies ut purus. Ut facilisis et lacus eu cursus.
+
+In eleifend velit vitae libero sollicitudin euismod:
+
+- Fusce vitae vestibulum velit,
+- Pellentesque vulputate lectus quis pellentesque commodo
+
+the end.
+```
+
+
+## Considered Options
+
+* [flexmark](https://github.com/vsch/flexmark-java) for markdown to
+  HTML, then use existing machinery described in [ADR
+  7](./0007_convert_html_files)
+* [pandoc](https://pandoc.org/) external command
+
+
+### flexmark markdown library for Java
+
+Process files with [flexmark](https://github.com/vsch/flexmark-java)
+and then create a PDF from the resulting HTML.
+
+Using the following snippet:
+
+``` scala
+def renderMarkdown(): ExitCode = {
+  val opts = new MutableDataSet()
+  opts.set(
+    Parser.EXTENSIONS.asInstanceOf[DataKey[util.Collection[_]]],
+    util.Arrays.asList(TablesExtension.create(), StrikethroughExtension.create())
+  )
+
+  val parser   = Parser.builder(opts).build()
+  val renderer = HtmlRenderer.builder(opts).build()
+  val reader   = Files.newBufferedReader(Paths.get("in.txt|md"))
+  val doc      = parser.parseReader(reader)
+  val html     = renderer.render(doc)
+  // wrap the rendered fragment in a minimal html document
+  val body     = "<html><body>" + html + "</body></html>"
+  Files.write(
+    Paths.get("test.html"),
+    body.getBytes(StandardCharsets.UTF_8))
+
+  ExitCode.Success
+}
+```
+
+Then run the result through `wkhtmltopdf`.
+
+Markdown file:
+
+![flexmark + wkhtmltopdf, markdown file](img/example-md-java.jpg)
+ +TXT file: +
+![flexmark + wkhtmltopdf, plain text file](img/example-txt-java.jpg)
+ + +### pandoc + +Command: + +``` +pandoc -f markdown -t html -o test.pdf microsite/docs/doc.md +``` + +Markdown/Latex: +
+![pandoc latex engine, markdown](img/example-md-pandoc-latex.jpg)
+ +Markdown/Html: +
+![pandoc html engine, markdown](img/example-md-pandoc-html.jpg)
+ +Text/Latex: +
+![pandoc latex engine, text](img/example-txt-pandoc-latex.jpg)
+ +Text/Html: +
+![pandoc html engine, text](img/example-txt-pandoc-html.jpg)
+
+
+## Decision Outcome
+
+Java library "flexmark".
+
+I think all results are great. It depends on the type of document and
+what one expects to see. I guess that most people expect something
+like pandoc-html produces for the kind of files docspell is for (it is
+not for newspaper articles, where pandoc-latex would be the best fit).
+
+But choosing pandoc means yet another external command to depend on.
+And the results from flexmark are really good, too. One can fiddle
+with options and CSS to make it look better.
+
+To not introduce another external command, the decision is to use
+flexmark and then the already existing html->pdf conversion.
diff --git a/modules/microsite/docs/dev/adr/0009_convert_office_docs.md b/modules/microsite/docs/dev/adr/0009_convert_office_docs.md
new file mode 100644
index 00000000..1ee089e5
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0009_convert_office_docs.md
@@ -0,0 +1,231 @@
+---
+layout: docs
+title: Convert Office Documents
+---
+
+# {{ page.title }}
+
+## Context and Problem Statement
+
+How can office documents, like `docx` or `odt`, be converted into a
+PDF file that looks as much as possible like the original?
+
+It would be nice to have a Java-only solution. But if an external tool
+has a better outcome, then an external tool is fine, too.
+
+Since Docspell is free software, the tools must also be free.
+
+## Considered Options
+
+* [Apache POI](https://poi.apache.org) together with
+  [this](https://search.maven.org/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.pdf/1.0.6/jar)
+  library
+* [pandoc](https://pandoc.org/) external command
+* [abiword](https://www.abisource.com/) external command
+* [Unoconv](https://github.com/unoconv/unoconv) external command
+
+To choose an option, some documents are converted to PDF and compared.
+Only the formats `docx` and `odt` are considered here. These are the
+most used formats. They have to look good; if an `xlsx` or `pptx`
+doesn't look so great, that is ok.
+
+Here is the native view to compare with:
+
+ODT:
+
+![Native view, ODT](img/example-odt-native.jpg)
+
+### `XWPFConverter`
+
+I couldn't get any example to work. There were exceptions:
+
+```
+java.lang.IllegalArgumentException: Value for parameter 'id' was out of bounds
+ at org.apache.poi.util.IdentifierManager.reserve(IdentifierManager.java:80)
+ at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:101)
+ at org.apache.poi.xwpf.usermodel.XWPFRun.<init>(XWPFRun.java:146)
+ at org.apache.poi.xwpf.usermodel.XWPFParagraph.buildRunsInOrderFromXml(XWPFParagraph.java:135)
+ at org.apache.poi.xwpf.usermodel.XWPFParagraph.<init>(XWPFParagraph.java:88)
+ at org.apache.poi.xwpf.usermodel.XWPFDocument.onDocumentRead(XWPFDocument.java:147)
+ at org.apache.poi.POIXMLDocument.load(POIXMLDocument.java:159)
+ at org.apache.poi.xwpf.usermodel.XWPFDocument.<init>(XWPFDocument.java:124)
+ at docspell.convert.Testing$.withPoi(Testing.scala:17)
+ at docspell.convert.Testing$.$anonfun$run$1(Testing.scala:12)
+ at cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:87)
+ at cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:355)
+ at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:376)
+ at cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:316)
+ at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
+ at cats.effect.internals.PoolUtils$$anon$2$$anon$3.run(PoolUtils.scala:51)
+ at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+ at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+ at java.lang.Thread.run(Thread.java:748)
+```
+
+The project (not Apache POI, the other one) seems unmaintained. I
+could not find any website, and the artifact in Maven Central is from
+2016.
+
+
+### Pandoc
+
+I know pandoc as a great tool for converting between markup
+documents. So this tries it with office documents. It supports `docx`
+and `odt` according to its `--list-input-formats`.
+
+From the pandoc manual:
+
+> By default, pandoc will use LaTeX to create the PDF, which requires
+> that a LaTeX engine be installed (see --pdf-engine below).
+> Alternatively, pandoc can use ConTeXt, roff ms, or HTML as an
+> intermediate format. To do this, specify an output file with a .pdf
+> extension, as before, but add the --pdf-engine option or -t context,
+> -t html, or -t ms to the command line. The tool used to generate the
+> PDF from the intermediate format may be specified using --pdf-engine.
+
+Trying with the latex engine:
+
+```
+pandoc -f odt -o test.pdf example.odt
+```
+
+Results ODT:
+
+![pandoc latex engine, ODT](img/example-odt-pandoc-latex.jpg)
+
+
+```
+pandoc -f docx -o test.pdf example.docx
+```
+
+Results DOCX:
+
+![pandoc latex engine, DOCX](img/example-docx-pandoc-latex.jpg)
+
+
+----
+
+Trying with the context engine:
+
+```
+pandoc -f odt -t context -o test.pdf example.odt
+```
+
+Results ODT:
+
+![pandoc context engine, ODT](img/example-odt-pandoc-context.jpg)
+ + +Results DOCX: + +
+![pandoc context engine, DOCX](img/example-docx-pandoc-context.jpg)
+
+
+----
+
+Trying with the ms engine:
+
+```
+pandoc -f odt -t ms -o test.pdf example.odt
+```
+
+Results ODT:
+
+![pandoc ms engine, ODT](img/example-odt-pandoc-ms.jpg)
+ +Results DOCX: + +
+![pandoc ms engine, DOCX](img/example-docx-pandoc-ms.jpg)
+
+
+---
+
+Trying with the html engine (this requires `wkhtmltopdf` to be present):
+
+```
+$ pandoc --extract-media . -f odt -t html -o test.pdf example.odt
+```
+
+Results ODT:
+
+![pandoc html engine, ODT](img/example-odt-pandoc-html.jpg)
+ +Results DOCX: + +
+![pandoc html engine, DOCX](img/example-docx-pandoc-html.jpg)
+ + +### Abiword + +Trying with: + +``` +abiword --to=pdf example.odt +``` + +Results: + +
+![abiword, ODT](img/example-odt-abiword.jpg)
+ + +Trying with a `docx` file failed. It worked with a `doc` file. + + +### Unoconv + +Unoconv relies on libreoffice/openoffice, so installing it will result +in installing parts of libreoffice, which is a very large dependency. + +Trying with: + +``` +unoconv -f pdf example.odt +``` + +Results ODT: + +
+![unoconv, ODT](img/example-odt-unoconv.jpg)
+ +Results DOCX: + +
+![unoconv, DOCX](img/example-docx-unoconv.jpg)
+
+## Decision Outcome
+
+Unoconv.
+
+The results from `unoconv` are really good.
+
+Abiword is also not that bad: it didn't convert the chart, but all
+font markup is there. It would be great to not depend on something as
+big as libreoffice, but the results are so much better.
+
+Also pandoc deals very well with DOCX files (using the `context`
+engine). The only thing that was not rendered was the embedded chart
+(like abiword). But all images and font styling were present.
+
+It will be a configurable external command anyway, so users can
+exchange it at any time with a different one.
diff --git a/modules/microsite/docs/dev/adr/0010_convert_image_files.md b/modules/microsite/docs/dev/adr/0010_convert_image_files.md
new file mode 100644
index 00000000..bf8e16d2
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0010_convert_image_files.md
@@ -0,0 +1,192 @@
+---
+layout: docs
+title: Convert Image Files
+---
+
+# {{ page.title }}
+
+## Context and Problem Statement
+
+How can image files be converted properly to PDF?
+
+Since there are thousands of different image formats, there will never
+be support for all. The most common containers should be supported,
+though:
+
+- jpeg (jfif, exif)
+- png
+- tiff (baseline, single page)
+
+The focus is on document images, maybe from digital cameras or
+scanners.
+
+## Considered Options
+
+* [pdfbox](https://pdfbox.apache.org) library
+* [imagemagick](https://www.imagemagick.org/) external command
+* [img2pdf](https://github.com/josch/img2pdf) external command
+* [tesseract](https://github.com/tesseract-ocr/tesseract) external command
+
+There are no screenshots here, because the results all look the same
+on the screen. Instead we look at the files' properties.
+
+**Input File**
+
+The input files are:
+
+```
+$ identify input/*
+input/jfif.jpg JPEG 2480x3514 2480x3514+0+0 8-bit sRGB 240229B 0.000u 0:00.000
+input/letter-en.jpg JPEG 1695x2378 1695x2378+0+0 8-bit Gray 256c 467341B 0.000u 0:00.000
+input/letter-en.png PNG 1695x2378 1695x2378+0+0 8-bit Gray 256c 191571B 0.000u 0:00.000
+input/letter-en.tiff TIFF 1695x2378 1695x2378+0+0 8-bit Grayscale Gray 4030880B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 240k
+- letter-en.jpg 467k
+- letter-en.png 191k
+- letter-en.tiff 4.0M
+
+### pdfbox
+
+Using a Java library is preferred if the quality is good enough.
+There is an
+[example](https://github.com/apache/pdfbox/blob/2cea31cc63623fd6ece149c60d5f0cc05a696ea7/examples/src/main/java/org/apache/pdfbox/examples/pdmodel/ImageToPDF.java)
+for this exact use case.
+
+This is the sample code:
+
+``` scala
+def imgtopdf(file: String): ExitCode = {
+  val jpg = Paths.get(file).toAbsolutePath
+  if (!Files.exists(jpg)) {
+    sys.error(s"file doesn't exist: $jpg")
+  }
+  val pd   = new PDDocument()
+  val page = new PDPage(PDRectangle.A4)
+  pd.addPage(page)
+  val bimg = ImageIO.read(jpg.toFile)
+
+  val img = LosslessFactory.createFromImage(pd, bimg)
+
+  val stream = new PDPageContentStream(pd, page)
+  stream.drawImage(img, 0, 0, PDRectangle.A4.getWidth, PDRectangle.A4.getHeight)
+  stream.close()
+
+  pd.save("test.pdf")
+  pd.close()
+
+  ExitCode.Success
+}
+```
+
+Using pdfbox 2.0.18 and twelvemonkeys 3.5. Running time: `1384ms`.
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129660B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49118B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 1.1M
+- letter-en.jpg 142k
+- letter-en.png 142k
+- letter-en.tiff 142k
+
+### img2pdf
+
+This is a Python tool that adds the image into the PDF without
+re-encoding.
+
+Using version 0.3.1. Running time: `323ms`.
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 129708B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+letter-en.png.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 595x842 595x842+0+0 16-bit sRGB 49864B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 241k
+- letter-en.jpg 468k
+- letter-en.png 191k
+- letter-en.tiff 192k
+
+### ImageMagick
+
+The well-known imagemagick tool can convert images to PDFs, too.
+
+Using version 6.9.10-71. Running time: `881ms`.
+
+```
+$ identify *.pdf
+jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 134873B 0.000u 0:00.000
+letter-en.jpg.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 360100B 0.000u 0:00.000
+letter-en.png.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
+letter-en.tiff.pdf PDF 1695x2378 1695x2378+0+0 16-bit sRGB 322418B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 300k
+- letter-en.jpg 390k
+- letter-en.png 180k
+- letter-en.tiff 5.1M
+
+
+### Tesseract
+
+Docspell already relies on tesseract for doing OCR. And in contrast to
+all other candidates, it can create PDFs that are searchable. Of
+course, this results in a much longer running time, which cannot be
+compared to the times of the other options.
+
+```
+tesseract doc3.jpg out -l deu pdf
+```
+
+It can also create both outputs in one go:
+
+```
+tesseract doc3.jpg out -l deu pdf txt
+```
+
+Using tesseract 4. Running time: `6661ms`.
+
+```
+$ identify *.pdf
+tesseract/jfif.jpg.pdf PDF 595x843 595x843+0+0 16-bit sRGB 130535B 0.000u 0:00.000
+tesseract/letter-en.jpg.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+tesseract/letter-en.png.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+tesseract/letter-en.tiff.pdf PDF 1743x2446 1743x2446+0+0 16-bit sRGB 328716B 0.000u 0:00.000
+```
+
+Size:
+- jfif.jpg 246k
+- letter-en.jpg 473k
+- letter-en.png 183k
+- letter-en.tiff 183k
+
+
+## Decision
+
+Tesseract.
+
+To not use more external tools, imagemagick and img2pdf are not
+chosen, even though img2pdf shows the best results and is fastest.
+
+The pdfbox library would be the favorite, because the results are good
+and with the [twelvemonkeys](https://github.com/haraldk/TwelveMonkeys)
+library there is support for many image formats. The priority is to
+avoid more external commands if possible.
+
+But since there already is a dependency on tesseract and it can create
+searchable PDFs, the decision is to use tesseract for this. Then PDFs
+with images can be converted to searchable PDFs with images. And text
+extraction is required anyway.
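+
+As an illustration, invoking tesseract from Scala could look like the
+following minimal sketch (it assumes a locally installed tesseract;
+the file name and language are just examples):
+
+``` scala
+import scala.sys.process._
+
+// Runs tesseract on an image, producing out.pdf (searchable) and
+// out.txt in one go; returns the process exit code.
+def imageToSearchablePdf(image: String, lang: String): Int =
+  Seq("tesseract", image, "out", "-l", lang, "pdf", "txt").!
+
+// e.g. imageToSearchablePdf("doc3.jpg", "deu")
+```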
diff --git a/modules/microsite/docs/dev/adr/0011_extract_text.md b/modules/microsite/docs/dev/adr/0011_extract_text.md
new file mode 100644
index 00000000..c90736b6
--- /dev/null
+++ b/modules/microsite/docs/dev/adr/0011_extract_text.md
@@ -0,0 +1,77 @@
+---
+layout: docs
+title: Extract Text from Files
+---
+
+# Extract Text from Files
+
+## Context and Problem Statement
+
+With support for more file types there must be a way to extract text
+from all of them. It is better to extract text from the source files,
+in contrast to extracting the text from the converted PDF file.
+
+There are multiple options and multiple file types. Again, the
+priority is to use a Java/Scala library to reduce external
+dependencies.
+
+## Considered Options
+
+### MS Office Documents
+
+There is only one library I know: [Apache
+POI](https://poi.apache.org/). It supports `doc(x)` and `xls(x)`.
+However, it doesn't support the open-document format (odt and ods).
+
+### OpenDocument Format
+
+There are two libraries:
+
+- [Apache Tika Parser](https://tika.apache.org/)
+- [ODFToolkit](https://github.com/tdf/odftoolkit)
+
+*Tika:* The tika-parsers package contains an opendocument parser for
+extracting text. But it has a huge dependency tree, since it is a
+super-package containing a parser for almost every common file type.
+
+*ODF Toolkit:* This depends on [Apache Jena](https://jena.apache.org)
+and also pulls in quite a few dependencies (though not as many as
+tika-parsers). It is not too bad, since it is a library for
+manipulating opendocument files. But all I need is to extract text. I
+created tests that extracted text from my odt/ods files. It worked at
+first sight, but running the tests in a loop resulted in strange
+NullPointerExceptions (it only worked on the first run).
+
+### Richtext
+
+Richtext is supported by the JDK (using `RTFEditorKit` from Swing).
+
+### PDF
+
+For "image" PDF files, tesseract is used. For "text" PDF files, the
+library [Apache PDFBox](https://pdfbox.apache.org) can be used.
+
+There also is [iText](https://github.com/itext/itext7) with an AGPL
+license.
+
+### Images
+
+For images and "image" PDF files, there is already tesseract in place.
+
+### HTML
+
+HTML must be converted into a PDF file before text can be extracted.
+
+### Text/Markdown
+
+These files can be used as-is, obviously.
+
+
+## Decision Outcome
+
+- MS Office files: POI library
+- Open Document files: Tika, but integrating the few source files that
+  make up the open document parser. Due to its huge dependency tree,
+  the library is not added.
+- PDF: Apache PDFBox. I know this library better than iText.
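+
+For the "text" PDF case, the PDFBox usage is roughly the following
+sketch (it assumes pdfbox 2.x on the classpath; for "image" PDFs this
+yields little or no text, which is where tesseract takes over):
+
+``` scala
+import java.io.File
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.text.PDFTextStripper
+
+// Extract the text layer of a PDF, making sure the document is closed.
+def pdfToText(file: File): String = {
+  val doc = PDDocument.load(file)
+  try new PDFTextStripper().getText(doc)
+  finally doc.close()
+}
+```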
diff --git a/modules/microsite/docs/dev/adr/img/example-docx-pandoc-context.jpg b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-context.jpg new file mode 100644 index 00000000..fdbbeed3 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-context.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-docx-pandoc-html.jpg b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-html.jpg new file mode 100644 index 00000000..3e22ecee Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-html.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-docx-pandoc-latex.jpg b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-latex.jpg new file mode 100644 index 00000000..fe42eedf Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-latex.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-docx-pandoc-ms.jpg b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-ms.jpg new file mode 100644 index 00000000..50766cf7 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-docx-pandoc-ms.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-docx-unoconv.jpg b/modules/microsite/docs/dev/adr/img/example-docx-unoconv.jpg new file mode 100644 index 00000000..7acf7c4d Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-docx-unoconv.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-html-native.jpg b/modules/microsite/docs/dev/adr/img/example-html-native.jpg new file mode 100644 index 00000000..91ba500f Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-html-native.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-html-pandoc-html.jpg b/modules/microsite/docs/dev/adr/img/example-html-pandoc-html.jpg new file mode 100644 index 00000000..79235243 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-html-pandoc-html.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-html-pandoc-latex.jpg b/modules/microsite/docs/dev/adr/img/example-html-pandoc-latex.jpg new file mode 100644 index 00000000..0c6cc22f Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-html-pandoc-latex.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-html-unoconv.jpg b/modules/microsite/docs/dev/adr/img/example-html-unoconv.jpg new file mode 100644 index 00000000..3d4d0f4e Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-html-unoconv.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-html-wkhtmltopdf.jpg b/modules/microsite/docs/dev/adr/img/example-html-wkhtmltopdf.jpg new file mode 100644 index 00000000..e7e6fe56 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-html-wkhtmltopdf.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-md-java.jpg b/modules/microsite/docs/dev/adr/img/example-md-java.jpg new file mode 100644 index 00000000..f65e3538 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-md-java.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-md-pandoc-html.jpg b/modules/microsite/docs/dev/adr/img/example-md-pandoc-html.jpg new file mode 100644 index 00000000..28429746 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-md-pandoc-html.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-md-pandoc-latex.jpg b/modules/microsite/docs/dev/adr/img/example-md-pandoc-latex.jpg new file 
mode 100644 index 00000000..6e7be587 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-md-pandoc-latex.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-abiword.jpg b/modules/microsite/docs/dev/adr/img/example-odt-abiword.jpg new file mode 100644 index 00000000..94fa1f69 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-abiword.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-native.jpg b/modules/microsite/docs/dev/adr/img/example-odt-native.jpg new file mode 100644 index 00000000..18a0a416 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-native.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-pandoc-context.jpg b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-context.jpg new file mode 100644 index 00000000..609868fa Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-context.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-pandoc-html.jpg b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-html.jpg new file mode 100644 index 00000000..780683c6 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-html.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-pandoc-latex.jpg b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-latex.jpg new file mode 100644 index 00000000..d2f43957 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-latex.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-pandoc-ms.jpg b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-ms.jpg new file mode 100644 index 00000000..fedf8d2f Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-pandoc-ms.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-odt-unoconv.jpg b/modules/microsite/docs/dev/adr/img/example-odt-unoconv.jpg new file mode 100644 index 00000000..e1a1ea22 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-odt-unoconv.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-txt-java.jpg b/modules/microsite/docs/dev/adr/img/example-txt-java.jpg new file mode 100644 index 00000000..3434ea18 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-txt-java.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-txt-pandoc-html.jpg b/modules/microsite/docs/dev/adr/img/example-txt-pandoc-html.jpg new file mode 100644 index 00000000..c46e5ebf Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-txt-pandoc-html.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/example-txt-pandoc-latex.jpg b/modules/microsite/docs/dev/adr/img/example-txt-pandoc-latex.jpg new file mode 100644 index 00000000..fa25a7d4 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/example-txt-pandoc-latex.jpg differ diff --git a/modules/microsite/docs/dev/adr/img/process-files.png b/modules/microsite/docs/dev/adr/img/process-files.png new file mode 100644 index 00000000..455b1a13 Binary files /dev/null and b/modules/microsite/docs/dev/adr/img/process-files.png differ diff --git a/modules/microsite/docs/dev/adr/process-files.puml b/modules/microsite/docs/dev/adr/process-files.puml new file mode 100644 index 00000000..2c5330cd --- /dev/null +++ b/modules/microsite/docs/dev/adr/process-files.puml @@ -0,0 +1,43 @@ +@startuml +scale 1200 width +title: Processing Files +skinparam monochrome true +skinparam 
backgroundColor white
+skinparam rectangle {
+  roundCorner<> 25
+  roundCorner<> 5
+}
+rectangle Input <> {
+  file "html"
+  file "plaintext"
+  file "image"
+  file "msoffice"
+  file "rtf"
+  file "odf"
+  file "pdf"
+}
+
+node toBoth [
+  PDF + TXT
+]
+node toPdf [
+  PDF
+]
+node toTxt [
+  TXT
+]
+
+image --> toBoth:
+html --> toPdf:
+toPdf --> toTxt:[pdfbox]
+plaintext --> html:[flexmark]
+msoffice --> toPdf:
+msoffice --> toTxt:[poi]
+rtf --> toTxt:[jdk]
+rtf --> toPdf:
+odf --> toTxt:[tika]
+odf --> toPdf:
+pdf --> toTxt:
+pdf --> toTxt:[pdfbox]
+plaintext -> toTxt:[identity]
+@enduml
diff --git a/modules/microsite/docs/doc.md b/modules/microsite/docs/doc.md
index a0927223..6932df00 100644
--- a/modules/microsite/docs/doc.md
+++ b/modules/microsite/docs/doc.md
@@ -6,6 +6,8 @@ title: Documentation
 
 # {{page.title}}
 
+This is the documentation for Docspell @VERSION@.
+
 Docspell assists in organizing large amounts of PDF files that are
 typically scanned paper documents. You can associate tags, set
 correspondends, what a document is concerned with, a name, a date and
@@ -38,6 +40,10 @@ be maintained manually. But usually, this data doesn't grow as fast as
 the documents. After a while there is a quite complete address book
 and only once in a while it has to be revisited.
 
+Besides extracting the text from documents to analyze, docspell also
+converts all files into PDF files. This unifies the different formats
+your documents may originally be in and makes them more accessible
+from other systems and in the future.
 
 ## Terms
 
@@ -68,7 +74,7 @@ multiple files, which are called **attachments**. And an item has
 - a **direction**: one of "incoming" or "outgoing"
 - a **name**: some item name, defaults to the file name of the
   attachments
-- some **notes**: arbitraty descriptive text. You can use markdown
+- some **notes**: arbitrary descriptive text. You can use markdown
   here, which is appropriately formatted in the web application.
 
 ### Collective
diff --git a/modules/microsite/docs/doc/install.md b/modules/microsite/docs/doc/install.md
index af4ba903..6f085d53 100644
--- a/modules/microsite/docs/doc/install.md
+++ b/modules/microsite/docs/doc/install.md
@@ -68,19 +68,28 @@ component.
   extraction](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
   (at the expense of a longer runtime).
 - [Tesseract](https://github.com/tesseract-ocr/tesseract) is the tool
-  doing the OCR (converts images into text). It is a widely used open
-  source OCR engine. Tesseract 3 and 4 should work with docspell; you
-  can adopt the command line in the configuration file, if necessary.
+  doing the OCR (converts images into text). It can also convert
+  images into PDF files. It is a widely used open source OCR engine.
+  Tesseract 3 and 4 should work with docspell; you can adapt the
+  command line in the configuration file, if necessary.
+- [Unoconv](https://github.com/unoconv/unoconv) is used to convert
+  office documents into PDF files. It uses libreoffice/openoffice.
+- [wkhtmltopdf](https://wkhtmltopdf.org/) is used to convert HTML into
+  PDF files.
+
+The performance of `unoconv` can be improved by starting `unoconv -l`
+in a separate process. This runs a libreoffice/openoffice listener and
+therefore avoids starting one each time `unoconv` is called.
 
### Example Debian On Debian this should install all joex requirements: ``` bash -sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper +sudo apt-get install ghostscript tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper unoconv wkhtmltopdf ``` + ## Database Both components must have access to a SQL database. Docspell has @@ -203,12 +212,15 @@ work is done by the joex components. ### Joex Running the joex component on the Raspberry Pi is possible, but will -result in long processing times. Tested on a RPi model 3 (4 cores, 1G -RAM) processing a PDF (scanned with 300dpi) with two pages took -9:52. You can speed it up considerably by uninstalling the `unpaper` -command, because this step takes quite long. This, of course, reduces -the quality of OCR. But without `unpaper` the same sample pdf was then -processed in 1:24, a speedup of 8 minutes. +result in long processing times for OCR. Files that don't require OCR +are no problem. + +Tested on an RPi model 3 (4 cores, 1G RAM) processing a PDF (scanned +with 300dpi) with two pages took 9:52. You can speed it up +considerably by uninstalling the `unpaper` command, because this step +takes quite long. This, of course, reduces the quality of OCR. But +without `unpaper` the same sample PDF was then processed in 1:24, a +speedup of 8 minutes. You should limit the joex pool size to 1 and, depending on your model and the amount of RAM, set a heap size of at least 500M diff --git a/modules/microsite/docs/doc/reverseproxy.md b/modules/microsite/docs/doc/reverseproxy.md new file mode 100644 index 00000000..17f42127 --- /dev/null +++ b/modules/microsite/docs/doc/reverseproxy.md @@ -0,0 +1,98 @@ +--- +layout: docs +title: Reverse Proxy +permalink: doc/reverseproxy +--- + +# {{ page.title }} + +This page contains examples of how to run docspell behind a reverse proxy. + +For the examples below, assume the following: + +- Docspell app is available at `192.168.1.11:7880`. If it is running + on the same machine as the reverse proxy server, you can set + `localhost:7880` instead. +- The external domain/hostname is `docspell.example.com` + +## Configuring Docspell + +These settings require a complementary config part in the docspell +configuration file: + +- First, if the Docspell REST server is on a different machine, you need + to change the `bind.address` setting to be either `0.0.0.0` or the + IP address of the network interface that the reverse proxy server + connects to. + ``` + docspell.server { + # Where the server binds to. + bind { + address = "192.168.1.11" + port = 7880 + } + } + ``` + Note that a value of `0.0.0.0` instead of `192.168.1.11` will bind + the server to every network interface. +- Docspell needs to know the external URL. The `base-url` setting + must point to the external address. Using the above values, it must be + set to `https://docspell.example.com`. + ``` + docspell.server { + # This is the base URL this application is deployed to. This is used + # to create absolute URLs and to configure the cookie. + base-url = "https://docspell.example.com" + ... + } + ``` + +Note that this example assumes that the docspell-joex component is on +the same machine. This page only covers exposing the REST +server and web application. + +If you have examples for more servers, please let me know or add them to +this site. + +## Nginx + +This defines two servers: one listens for http traffic and redirects +to the https variant. Additionally, it defines the Let's Encrypt +`.well-known` folder name.
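Before the server definitions, a short aside on the docspell side of this setup: the `bind` and `base-url` values above live in the HOCON config file, and docspell reads such files via pureconfig (see `project/Dependencies.scala`). A minimal sketch of loading the two settings, with hypothetical case classes that only mirror this page's snippets, not the real config classes of the restserver module:

```scala
import pureconfig._
import pureconfig.generic.auto._

// Hypothetical, trimmed-down shapes for the two settings discussed above.
final case class Bind(address: String, port: Int)
final case class ServerCfg(baseUrl: String, bind: Bind)

object LoadServerCfg {
  def main(args: Array[String]): Unit =
    // pureconfig maps camelCase fields to kebab-case keys, so the
    // `baseUrl` field reads the `base-url` setting shown above.
    ConfigSource.default.at("docspell.server").load[ServerCfg] match {
      case Right(cfg) =>
        println(s"bind ${cfg.bind.address}:${cfg.bind.port}, external ${cfg.baseUrl}")
      case Left(errs) =>
        Console.err.println(errs.toList.mkString("\n"))
    }
}
```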
+ +The https server endpoint is configured with the let's encrypt +certificates and acts as a proxy for the application at +`192.168.1.11:7880`. + +``` +server { + listen 0.0.0.0:80 ; + listen [::]:80 ; + server_name docspell.example.com ; + location /.well-known/acme-challenge { + root /var/data/nginx/ACME-PUBLIC; + auth_basic off; + } + location / { + return 301 https://$host$request_uri; + } +} +server { + listen 0.0.0.0:443 ssl http2 ; + listen [::]:443 ssl http2 ; + server_name docspell.example.com ; + location /.well-known/acme-challenge { + root /var/data/nginx/ACME-PUBLIC; + auth_basic off; + } + ssl_certificate /var/lib/acme/docspell.example.com/fullchain.pem; + ssl_certificate_key /var/lib/acme/docspell.example.com/key.pem; + ssl_trusted_certificate /var/lib/acme/docspell.example.com/full.pem; + location / { + proxy_pass http://192.168.1.11:7880; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + } +} +``` diff --git a/modules/microsite/docs/features.md b/modules/microsite/docs/features.md index 5e9ec6ba..c5d60643 100644 --- a/modules/microsite/docs/features.md +++ b/modules/microsite/docs/features.md @@ -9,6 +9,7 @@ title: Features and Limitations - Multiple users per account - Handle multiple documents as one unit - OCR using [tesseract](https://github.com/tesseract-ocr/tesseract) +- Conversion to PDF: all files are converted into a PDF file - Text is analysed to find and attach meta data automatically - Manage document processing (cancel jobs, set priorities) - Everything available via a documented [REST Api](api) @@ -18,6 +19,14 @@ title: Features and Limitations - REST server and document processing are separate applications which can be scaled-out independently - Everything stored in a SQL database: PostgreSQL, MariaDB or H2 +- Files supported: + - PDF + - common MS Office (doc, docx, xls, xlsx) + - OpenDocument (odt, ods) + - RichText (rtf) + - Images (jpg, png, tiff) + - HTML + - text/* (treated as Markdown) - Tools: - Watch a folder: watch folders for changes and send files to docspell - Firefox plugin: right click on a link and send the file to docspell @@ -31,7 +40,6 @@ These are current known limitations that may be of interest for considering docspell at the moment. Hopefully they will be resolved eventually…. -- Only PDF files possible for now. - No fulltext search implemented. This currently has very low priority, because I myself never needed it. Open an issue if you find it important. diff --git a/modules/microsite/docs/getit.md b/modules/microsite/docs/getit.md index 0d533269..19c2721f 100644 --- a/modules/microsite/docs/getit.md +++ b/modules/microsite/docs/getit.md @@ -18,11 +18,20 @@ You need to download the two files: ## Prerequisite Install Java (use your package manager or look -[here](https://adoptopenjdk.net/)), -[tesseract](https://github.com/tesseract-ocr/tesseract), -[ghostscript](http://pages.cs.wisc.edu/~ghost/) and possibly -[unpaper](https://github.com/Flameeyes/unpaper). The last is not -really required, but improves OCR. +[here](https://adoptopenjdk.net/)). + +OCR functionality requires the following tools: + +- [tesseract](https://github.com/tesseract-ocr/tesseract), +- [ghostscript](http://pages.cs.wisc.edu/~ghost/) and possibly +- [unpaper](https://github.com/Flameeyes/unpaper). + +The last is not really required, but improves OCR. 
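All of these tools are plain external command-line programs. Docspell's configuration describes each of them as a program, an argument list containing `{{placeholder}}` variables, and a timeout (this `Command` pattern is visible in the nix module further below). A minimal sketch of that pattern using `scala.sys.process`; the timeout handling of the real implementation is omitted, and the helper type here is an assumption, not docspell's actual command runner:

```scala
import scala.sys.process._

// Sketch of the program/args/{{placeholder}} pattern from the configuration.
final case class Command(program: String, args: Seq[String]) {
  def replace(vars: Map[String, String]): Command =
    copy(args = args.map(a => vars.foldLeft(a) { case (s, (k, v)) => s.replace(k, v) }))
}

object RunTesseract {
  // Matches the default tesseract command from the configuration
  val tesseract = Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"))

  def main(args: Array[String]): Unit = {
    val cmd = tesseract.replace(Map("{{file}}" -> "scan.png", "{{lang}}" -> "eng"))
    // `!!` runs the program and returns its stdout; requires tesseract on the PATH
    val text = (cmd.program +: cmd.args).!!
    println(text.take(200))
  }
}
```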
+ +PDF conversion requires the following tools: + +- [unoconv](https://github.com/unoconv/unoconv) +- [wkhtmltopdf](https://wkhtmltopdf.org/) ## Running diff --git a/modules/microsite/src/main/resources/microsite/data/menu.yml b/modules/microsite/src/main/resources/microsite/data/menu.yml index 69d5299f..4446063d 100644 --- a/modules/microsite/src/main/resources/microsite/data/menu.yml +++ b/modules/microsite/src/main/resources/microsite/data/menu.yml @@ -21,6 +21,9 @@ options: - title: Nix/NixOS url: doc/nix + - title: Reverse Proxy + url: doc/reverseproxy + - title: Configuring url: doc/configure.html diff --git a/modules/restapi/src/main/resources/docspell-openapi.yml b/modules/restapi/src/main/resources/docspell-openapi.yml index c40d586c..4b38a930 100644 --- a/modules/restapi/src/main/resources/docspell-openapi.yml +++ b/modules/restapi/src/main/resources/docspell-openapi.yml @@ -1172,7 +1172,8 @@ paths: tags: [ Attachment ] summary: Get an attachment file. description: | - Get the binary file belonging to the attachment with the given id. + Get information about the binary file belonging to the + attachment with the given id. security: - authTokenHeader: [] parameters: @@ -1198,7 +1199,60 @@ paths: tags: [ Attachment ] summary: Get an attachment file. description: | - Get the binary file belonging to the attachment with the given id. + Get the binary file belonging to the attachment with the given + id. + security: + - authTokenHeader: [] + parameters: + - $ref: "#/components/parameters/id" + responses: + 200: + description: Ok + content: + application/octet-stream: + schema: + type: string + format: binary + /sec/attachment/{id}/original: + head: + tags: [ Attachment ] + summary: Get an attachment file. + description: | + Get information about the original binary file of the + attachment with the given id. + + If the attachment is a converted PDF file, this route gets the + original file as it was uploaded. + security: + - authTokenHeader: [] + parameters: + - $ref: "#/components/parameters/id" + responses: + 200: + description: Ok + headers: + Content-Type: + schema: + type: string + Content-Length: + schema: + type: integer + format: int64 + ETag: + schema: + type: string + Content-Disposition: + schema: + type: string + get: + tags: [ Attachment ] + summary: Get an attachment file. + description: | + Get the original binary file of the attachment with the given + id. + + If the attachment is a converted PDF file, this route gets the + original file as it was uploaded. security: - authTokenHeader: [] parameters: @@ -1790,6 +1844,7 @@ components: - created - updated - attachments + - sources - tags properties: id: @@ -1836,6 +1891,10 @@ components: type: array items: $ref: "#/components/schemas/Attachment" + sources: + type: array + items: + $ref: "#/components/schemas/AttachmentSource" tags: type: array items: @@ -1847,6 +1906,7 @@ components: - id - size - contentType + - converted properties: id: type: string @@ -1859,6 +1919,29 @@ components: contentType: type: string format: mimetype + converted: + type: boolean + AttachmentSource: + description: | + The source or original file of an attachment. + required: + - id + - size + - contentType + properties: + id: + type: string + format: ident + description: | + The id is the attachment id. + name: + type: string + size: + type: integer + format: int64 + contentType: + type: string + format: mimetype Registration: description: | Data for registering a new account. 
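For illustration, here is how a client might call the new `/sec/attachment/{id}/original` route described in the OpenAPI changes above, using the JDK 11 HTTP client. The `/api/v1` prefix, the `X-Docspell-Auth` header name and all placeholder values are assumptions for this sketch, not taken from the spec excerpt.

```scala
import java.net.URI
import java.net.http.{HttpClient, HttpRequest, HttpResponse}
import java.nio.file.Paths

object FetchOriginal {
  def main(args: Array[String]): Unit = {
    val client = HttpClient.newHttpClient()
    val req = HttpRequest
      .newBuilder(URI.create("http://localhost:7880/api/v1/sec/attachment/ATTACH_ID/original"))
      .header("X-Docspell-Auth", "AUTH_TOKEN") // assumed header name
      .build()
    // Streams the original (pre-conversion) file to disk
    val resp = client.send(req, HttpResponse.BodyHandlers.ofFile(Paths.get("original.bin")))
    println(s"status: ${resp.statusCode()}")
  }
}
```

The route also honours `If-None-Match`: re-sending the `ETag` from an earlier response should yield `304 Not Modified` instead of the file body, mirroring the existing attachment route.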
diff --git a/modules/restserver/src/main/resources/reference.conf b/modules/restserver/src/main/resources/reference.conf index 182bb0e4..4e165dc5 100644 --- a/modules/restserver/src/main/resources/reference.conf +++ b/modules/restserver/src/main/resources/reference.conf @@ -80,9 +80,10 @@ docspell.server { # The file content types that are considered valid. Docspell # will only pass these files to processing. The processing code # itself has also checks for which files are supported and which - # not. This affects the uploading part and is a first check to - # avoid that 'bad' files get into the system. - valid-mime-types = [ "application/pdf" ] + # not. This affects the uploading part and can be used to + # restrict file types that should be handed over to processing. + # By default all files are allowed. + valid-mime-types = [ ] } } } \ No newline at end of file diff --git a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala index a7e3db2e..a018e2ac 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/conv/Conversions.scala @@ -84,12 +84,18 @@ trait Conversions { data.inReplyTo.map(mkIdName), data.item.dueDate, data.item.notes, - data.attachments.map((mkAttachment _).tupled).toList, + data.attachments.map((mkAttachment(data)_).tupled).toList, + data.sources.map((mkAttachmentSource _).tupled).toList, data.tags.map(mkTag).toList ) - def mkAttachment(ra: RAttachment, m: FileMeta): Attachment = - Attachment(ra.id, ra.name, m.length, MimeType.unsafe(m.mimetype.asString)) + def mkAttachment(item: OItem.ItemData)(ra: RAttachment, m: FileMeta): Attachment = { + val converted = item.sources.find(_._1.id == ra.id).exists(_._2.checksum != m.checksum) + Attachment(ra.id, ra.name, m.length, MimeType.unsafe(m.mimetype.asString), converted) + } + + def mkAttachmentSource(ra: RAttachmentSource, m: FileMeta): AttachmentSource = + AttachmentSource(ra.id, ra.name, m.length, MimeType.unsafe(m.mimetype.asString)) // item list diff --git a/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala b/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala index 49b27268..4537fa88 100644 --- a/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala +++ b/modules/restserver/src/main/scala/docspell/restserver/routes/AttachmentRoutes.scala @@ -1,5 +1,6 @@ package docspell.restserver.routes +import bitpeace.FileMeta import cats.data.NonEmptyList import cats.effect._ import cats.implicits._ @@ -22,13 +23,13 @@ object AttachmentRoutes { val dsl = new Http4sDsl[F] {} import dsl._ - def withResponseHeaders(resp: F[Response[F]])(data: OItem.AttachmentData[F]): F[Response[F]] = { + def withResponseHeaders(resp: F[Response[F]])(data: OItem.BinaryData[F]): F[Response[F]] = { val mt = MediaType.unsafeParse(data.meta.mimetype.asString) val ctype = `Content-Type`(mt) val cntLen: Header = `Content-Length`.unsafeFromLong(data.meta.length) val eTag: Header = ETag(data.meta.checksum) val disp: Header = - `Content-Disposition`("inline", Map("filename" -> data.ra.name.getOrElse(""))) + `Content-Disposition`("inline", Map("filename" -> data.name.getOrElse(""))) resp.map(r => if (r.status == NotModified) r.withHeaders(ctype, eTag, disp) @@ -36,7 +37,7 @@ object AttachmentRoutes { ) } - def makeByteResp(data: OItem.AttachmentData[F]): F[Response[F]] = + def 
makeByteResp(data: OItem.BinaryData[F]): F[Response[F]] = withResponseHeaders(Ok(data.data.take(data.meta.length)))(data) HttpRoutes.of { @@ -52,7 +53,7 @@ object AttachmentRoutes { for { fileData <- backend.item.findAttachment(id, user.account.collective) inm = req.headers.get(`If-None-Match`).flatMap(_.tags) - matches = matchETag(fileData, inm) + matches = matchETag(fileData.map(_.meta), inm) resp <- fileData .map({ data => if (matches) withResponseHeaders(NotModified())(data) @@ -61,6 +62,27 @@ object AttachmentRoutes { .getOrElse(NotFound(BasicResult(false, "Not found"))) } yield resp + case HEAD -> Root / Ident(id) / "original" => + for { + fileData <- backend.item.findAttachmentSource(id, user.account.collective) + resp <- fileData + .map(data => withResponseHeaders(Ok())(data)) + .getOrElse(NotFound(BasicResult(false, "Not found"))) + } yield resp + + case req @ GET -> Root / Ident(id) / "original" => + for { + fileData <- backend.item.findAttachmentSource(id, user.account.collective) + inm = req.headers.get(`If-None-Match`).flatMap(_.tags) + matches = matchETag(fileData.map(_.meta), inm) + resp <- fileData + .map({ data => + if (matches) withResponseHeaders(NotModified())(data) + else makeByteResp(data) + }) + .getOrElse(NotFound(BasicResult(false, "Not found"))) + } yield resp + case GET -> Root / Ident(id) / "view" => // this route exists to provide a stable url // it redirects currently to viewerjs @@ -78,12 +100,12 @@ object AttachmentRoutes { } private def matchETag[F[_]]( - fileData: Option[OItem.AttachmentData[F]], - noneMatch: Option[NonEmptyList[EntityTag]] + fileData: Option[FileMeta], + noneMatch: Option[NonEmptyList[EntityTag]] ): Boolean = (fileData, noneMatch) match { - case (Some(fd), Some(nm)) => - fd.meta.checksum == nm.head.tag + case (Some(meta), Some(nm)) => + meta.checksum == nm.head.tag case _ => false } diff --git a/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql new file mode 100644 index 00000000..6f93ca0c --- /dev/null +++ b/modules/store/src/main/resources/db/migration/mariadb/V1.2.0__origin_source.sql @@ -0,0 +1,11 @@ +CREATE TABLE `attachment_source` ( + `id` varchar(254) not null primary key, + `file_id` varchar(254) not null, + `filename` varchar(254), + `created` timestamp not null, + foreign key (`file_id`) references `filemeta`(`id`), + foreign key (`id`) references `attachment`(`attachid`) +); + +INSERT INTO `attachment_source` + SELECT `attachid`,`filemetaid`,`name`,`created` FROM `attachment`; diff --git a/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql new file mode 100644 index 00000000..630ea05d --- /dev/null +++ b/modules/store/src/main/resources/db/migration/postgresql/V1.2.0__origin_source.sql @@ -0,0 +1,11 @@ +CREATE TABLE "attachment_source" ( + "id" varchar(254) not null primary key, + "file_id" varchar(254) not null, + "filename" varchar(254), + "created" timestamp not null, + foreign key ("file_id") references "filemeta"("id"), + foreign key ("id") references "attachment"("attachid") +); + +INSERT INTO "attachment_source" + SELECT "attachid","filemetaid","name","created" FROM "attachment"; diff --git a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala index 9a652df0..8e300370 100644 --- 
a/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QAttachment.scala @@ -8,28 +8,32 @@ import doobie.implicits._ import docspell.common.{Ident, MetaProposalList} import docspell.store.Store import docspell.store.impl.Implicits._ -import docspell.store.records.{RAttachment, RAttachmentMeta, RItem} +import docspell.store.records.{RAttachment, RAttachmentMeta, RAttachmentSource, RItem} object QAttachment { def deleteById[F[_]: Sync](store: Store[F])(attachId: Ident, coll: Ident): F[Int] = for { - raOpt <- store.transact(RAttachment.findByIdAndCollective(attachId, coll)) - n <- raOpt.traverse(_ => store.transact(RAttachment.delete(attachId))) - f <- Stream - .emit(raOpt) - .unNoneTerminate - .map(_.fileId.id) + raFile <- store.transact(RAttachment.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId)) + rsFile <- store.transact(RAttachmentSource.findByIdAndCollective(attachId, coll)).map(_.map(_.fileId)) + n <- store.transact(RAttachment.delete(attachId)) + f <- Stream.emits(raFile.toSeq ++ rsFile.toSeq) + .map(_.id) .flatMap(store.bitpeace.delete) + .map(flag => if (flag) 1 else 0) .compile - .last - } yield n.getOrElse(0) + f.map(_ => 1).getOrElse(0) + .foldMonoid + } yield n + f def deleteAttachment[F[_]: Sync](store: Store[F])(ra: RAttachment): F[Int] = for { + s <- store.transact(RAttachmentSource.findById(ra.id)) n <- store.transact(RAttachment.delete(ra.id)) - f <- Stream.emit(ra.fileId.id).flatMap(store.bitpeace.delete).compile.last - } yield n + f.map(_ => 1).getOrElse(0) + f <- Stream.emits(ra.fileId.id +: s.map(_.fileId.id).toSeq). + flatMap(store.bitpeace.delete). + map(flag => if (flag) 1 else 0). + compile.foldMonoid + } yield n + f def deleteItemAttachments[F[_]: Sync](store: Store[F])(itemId: Ident, coll: Ident): F[Int] = for { diff --git a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala index 28440b4b..70b32650 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QCollective.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QCollective.scala @@ -14,7 +14,6 @@ object QCollective { def getInsights(coll: Ident): ConnectionIO[InsightData] = { val IC = RItem.Columns - val AC = RAttachment.Columns val TC = RTag.Columns val RC = RTagItem.Columns val q0 = selectCount( @@ -28,12 +27,21 @@ object QCollective { and(IC.cid.is(coll), IC.incoming.is(Direction.outgoing)) ).query[Int].unique - val q2 = fr"SELECT sum(m.length) FROM" ++ RItem.table ++ fr"i" ++ - fr"INNER JOIN" ++ RAttachment.table ++ fr"a ON" ++ AC.itemId - .prefix("a") - .is(IC.id.prefix("i")) ++ - fr"INNER JOIN filemeta m ON m.id =" ++ AC.fileId.prefix("a").f ++ - fr"WHERE" ++ IC.cid.is(coll) + + val fileSize = sql""" + select sum(length) from ( + with attachs as + (select a.attachid as aid, a.filemetaid as fid + from attachment a + inner join item i on a.itemid = i.itemid + where i.cid = $coll) + select a.fid,m.length from attachs a + inner join filemeta m on m.id = a.fid + union distinct + select a.file_id,m.length from attachment_source a + inner join filemeta m on m.id = a.file_id where a.id in (select aid from attachs) + ) as t""".query[Option[Long]].unique + val q3 = fr"SELECT" ++ commas( TC.name.prefix("t").f, @@ -47,9 +55,9 @@ object QCollective { for { n0 <- q0 n1 <- q1 - n2 <- q2.query[Option[Long]].unique + n2 <- fileSize n3 <- q3.query[(String, Int)].to[Vector] - } yield InsightData(n0, n1, n2.getOrElse(0), 
Map.from(n3)) + } yield InsightData(n0, n1, n2.getOrElse(0L), Map.from(n3)) } def getContacts( diff --git a/modules/store/src/main/scala/docspell/store/queries/QItem.scala b/modules/store/src/main/scala/docspell/store/queries/QItem.scala index 1927c01d..21cd0d75 100644 --- a/modules/store/src/main/scala/docspell/store/queries/QItem.scala +++ b/modules/store/src/main/scala/docspell/store/queries/QItem.scala @@ -23,7 +23,8 @@ object QItem { concEquip: Option[REquipment], inReplyTo: Option[IdRef], tags: Vector[RTag], - attachments: Vector[(RAttachment, FileMeta)] + attachments: Vector[(RAttachment, FileMeta)], + sources: Vector[(RAttachmentSource, FileMeta)] ) { def filterCollective(coll: Ident): Option[ItemData] = @@ -69,14 +70,16 @@ object QItem { ] .option val attachs = RAttachment.findByItemWithMeta(id) + val sources = RAttachmentSource.findByItemWithMeta(id) val tags = RTag.findByItem(id) for { data <- q att <- attachs + srcs <- sources ts <- tags - } yield data.map(d => ItemData(d._1, d._2, d._3, d._4, d._5, d._6, ts, att)) + } yield data.map(d => ItemData(d._1, d._2, d._3, d._4, d._5, d._6, ts, att, srcs)) } case class ListItem( diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala index ee193e69..728abc95 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachment.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachment.scala @@ -38,9 +38,26 @@ object RAttachment { fr"${v.id},${v.itemId},${v.fileId.id},${v.position},${v.created},${v.name}" ).update.run + def updateFileIdAndName(attachId: Ident, fId: Ident, fname: Option[String]): ConnectionIO[Int] = + updateRow(table, id.is(attachId), commas(fileId.setTo(fId), name.setTo(fname))).update.run + def findById(attachId: Ident): ConnectionIO[Option[RAttachment]] = selectSimple(all, table, id.is(attachId)).query[RAttachment].option + def findMeta(attachId: Ident): ConnectionIO[Option[FileMeta]] = { + import bitpeace.sql._ + + val cols = RFileMeta.Columns.all.map(_.prefix("m")) + val aId = id.prefix("a") + val aFileMeta = fileId.prefix("a") + val mId = RFileMeta.Columns.id.prefix("m") + + val from = table ++ fr"a INNER JOIN" ++ RFileMeta.table ++ fr"m ON" ++ aFileMeta.is(mId) + val cond = aId.is(attachId) + + selectSimple(cols, from, cond).query[FileMeta].option + } + def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachment]] = selectSimple( all.map(_.prefix("a")), @@ -94,7 +111,8 @@ object RAttachment { def delete(attachId: Ident): ConnectionIO[Int] = for { n0 <- RAttachmentMeta.delete(attachId) - n1 <- deleteFrom(table, id.is(attachId)).update.run - } yield n0 + n1 + n1 <- RAttachmentSource.delete(attachId) + n2 <- deleteFrom(table, id.is(attachId)).update.run + } yield n0 + n1 + n2 } diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala index f1887399..9de923e2 100644 --- a/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentMeta.scala @@ -8,11 +8,16 @@ import docspell.store.impl._ import docspell.store.impl.Implicits._ case class RAttachmentMeta( - id: Ident, + id: Ident, //same as RAttachment.id content: Option[String], nerlabels: List[NerLabel], proposals: MetaProposalList -) {} +) { + + def setContentIfEmpty(txt: Option[String]): RAttachmentMeta = + if 
(content.forall(_.trim.isEmpty)) copy(content = txt) + else this +} object RAttachmentMeta { def empty(attachId: Ident) = RAttachmentMeta(attachId, None, Nil, MetaProposalList.empty) diff --git a/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala new file mode 100644 index 00000000..cf90f2d3 --- /dev/null +++ b/modules/store/src/main/scala/docspell/store/records/RAttachmentSource.scala @@ -0,0 +1,84 @@ +package docspell.store.records + +import bitpeace.FileMeta +import doobie._ +import doobie.implicits._ +import docspell.common._ +import docspell.store.impl._ +import docspell.store.impl.Implicits._ + +/** The origin file of an attachment. The `id` is shared with the + * attachment, to create a 1-1 (or 0..1-1) relationship. + */ +case class RAttachmentSource( + id: Ident, //same as RAttachment.id + fileId: Ident, + name: Option[String], + created: Timestamp +) + +object RAttachmentSource { + + val table = fr"attachment_source" + + object Columns { + val id = Column("id") + val fileId = Column("file_id") + val name = Column("filename") + val created = Column("created") + + val all = List(id, fileId, name, created) + } + + import Columns._ + + def of(ra: RAttachment): RAttachmentSource = + RAttachmentSource(ra.id, ra.fileId, ra.name, ra.created) + + def insert(v: RAttachmentSource): ConnectionIO[Int] = + insertRow(table, all, fr"${v.id},${v.fileId},${v.name},${v.created}").update.run + + + def findById(attachId: Ident): ConnectionIO[Option[RAttachmentSource]] = + selectSimple(all, table, id.is(attachId)).query[RAttachmentSource].option + + def delete(attachId: Ident): ConnectionIO[Int] = + deleteFrom(table, id.is(attachId)).update.run + + def findByIdAndCollective(attachId: Ident, collective: Ident): ConnectionIO[Option[RAttachmentSource]] = { + val bId = RAttachment.Columns.id.prefix("b") + val aId = Columns.id.prefix("a") + val bItem = RAttachment.Columns.itemId.prefix("b") + val iId = RItem.Columns.id.prefix("i") + val iColl = RItem.Columns.cid.prefix("i") + + val from = table ++ fr"a INNER JOIN" ++ + RAttachment.table ++ fr"b ON" ++ aId.is(bId) ++ + fr"INNER JOIN" ++ RItem.table ++ fr"i ON" ++ bItem.is(iId) + + val where = and(aId.is(attachId), bId.is(attachId), iColl.is(collective)) + + selectSimple(all.map(_.prefix("a")), from, where).query[RAttachmentSource].option + } + + def findByItemWithMeta(id: Ident): ConnectionIO[Vector[(RAttachmentSource, FileMeta)]] = { + import bitpeace.sql._ + + val aId = Columns.id.prefix("a") + val afileMeta = fileId.prefix("a") + val bPos = RAttachment.Columns.position.prefix("b") + val bId = RAttachment.Columns.id.prefix("b") + val bItem = RAttachment.Columns.itemId.prefix("b") + val mId = RFileMeta.Columns.id.prefix("m") + + val cols = all.map(_.prefix("a")) ++ RFileMeta.Columns.all.map(_.prefix("m")) + val from = table ++ fr"a INNER JOIN" ++ + RFileMeta.table ++ fr"m ON" ++ afileMeta.is(mId) ++ fr"INNER JOIN" ++ + RAttachment.table ++ fr"b ON" ++ aId.is(bId) + val where = bItem.is(id) + + (selectSimple(cols, from, where) ++ orderBy(bPos.asc)). 
+ query[(RAttachmentSource, FileMeta)].to[Vector] + } + +} diff --git a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala index daa81029..1749a653 100644 --- a/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala +++ b/modules/store/src/main/scala/docspell/store/records/RFileMeta.scala @@ -1,7 +1,12 @@ package docspell.store.records +import bitpeace.FileMeta +import doobie._ import doobie.implicits._ + +import docspell.common._ import docspell.store.impl._ +import docspell.store.impl.Implicits._ object RFileMeta { @@ -19,4 +24,10 @@ object RFileMeta { val all = List(id, timestamp, mimetype, length, checksum, chunks, chunksize) } + + def findById(fid: Ident): ConnectionIO[Option[FileMeta]] = { + import bitpeace.sql._ + + selectSimple(Columns.all, table, Columns.id.is(fid)).query[FileMeta].option + } } diff --git a/modules/text/src/main/scala/docspell/text/ocr/Config.scala b/modules/text/src/main/scala/docspell/text/ocr/Config.scala deleted file mode 100644 index f2f8e5d1..00000000 --- a/modules/text/src/main/scala/docspell/text/ocr/Config.scala +++ /dev/null @@ -1,66 +0,0 @@ -package docspell.text.ocr - -import java.nio.file.{Path, Paths} - -import docspell.common._ - -case class Config( - allowedContentTypes: Set[MimeType], - ghostscript: Config.Ghostscript, - pageRange: Config.PageRange, - unpaper: Config.Unpaper, - tesseract: Config.Tesseract -) { - - def isAllowed(mt: MimeType): Boolean = - allowedContentTypes contains mt -} - -object Config { - case class PageRange(begin: Int) - - case class Command(program: String, args: Seq[String], timeout: Duration) { - - def mapArgs(f: String => String): Command = - Command(program, args.map(f), timeout) - - def toCmd: List[String] = - program :: args.toList - - lazy val cmdString: String = - toCmd.mkString(" ") - } - - case class Ghostscript(command: Command, workingDir: Path) - case class Tesseract(command: Command) - case class Unpaper(command: Command) - - val default = Config( - allowedContentTypes = Set( - MimeType.pdf, - MimeType.png, - MimeType.jpeg, - MimeType.tiff - ), - pageRange = PageRange(10), - ghostscript = Ghostscript( - Command( - "gs", - Seq( - "-dNOPAUSE", - "-dBATCH", - "-dSAFER", - "-sDEVICE=tiffscaled8", - "-sOutputFile={{outfile}}", - "{{infile}}" - ), - Duration.seconds(30) - ), - Paths.get(System.getProperty("java.io.tmpdir")).resolve("docspell-extraction") - ), - unpaper = Unpaper(Command("unpaper", Seq("{{infile}}", "{{outfile}}"), Duration.seconds(30))), - tesseract = Tesseract( - Command("tesseract", Seq("{{file}}", "stdout", "-l", "{{lang}}"), Duration.minutes(1)) - ) - ) -} diff --git a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala b/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala deleted file mode 100644 index 23c39f16..00000000 --- a/modules/text/src/main/scala/docspell/text/ocr/MimeTypeHint.scala +++ /dev/null @@ -1,7 +0,0 @@ -package docspell.text.ocr - -case class MimeTypeHint(filename: Option[String], advertised: Option[String]) {} - -object MimeTypeHint { - val none = MimeTypeHint(None, None) -} diff --git a/modules/text/src/test/scala/docspell/text/TestFiles.scala b/modules/text/src/test/scala/docspell/text/TestFiles.scala deleted file mode 100644 index a33bf4b9..00000000 --- a/modules/text/src/test/scala/docspell/text/TestFiles.scala +++ /dev/null @@ -1,96 +0,0 @@ -package docspell.text - -import cats.effect.{Blocker, IO} -import docspell.common.LenientUri -import 
fs2.Stream - -import scala.concurrent.ExecutionContext - -object TestFiles { - val blocker = Blocker.liftExecutionContext(ExecutionContext.global) - implicit val CS = IO.contextShift(ExecutionContext.global) - - val letterSourceDE: Stream[IO, Byte] = - LenientUri - .fromJava(getClass.getResource("/letter-de-source.pdf")) - .readURL[IO](16 * 1024, blocker) - - val letterSourceEN: Stream[IO, Byte] = - LenientUri - .fromJava(getClass.getResource("/letter-en-source.pdf")) - .readURL[IO](16 * 1024, blocker) - - val letterDEText = - """Max Mustermann - | - |Lilienweg 21 - | - |12345 Nebendorf - | - |E-Mail: max.muster@gmail.com - | - |Max Mustermann, Lilienweg 21, 12345 Nebendorf - | - |EasyCare AG - |Abteilung Buchhaltung - |Ackerweg 12 - | - |12346 Ulmen - | - |Nebendorf, 3. September 2019 - |Sehr geehrte Damen und Herren, - | - |hiermit kündige ich meine Mitgliedschaft in der Kranken- und Pflegeversicherung zum - |nächstmöglichen Termin. - | - |Bitte senden Sie mir innerhalb der gesetzlichen Frist von 14 Tagen eine Kündigungsbe- - |stätigung zu. - | - |Vielen Dank im Vorraus! - | - |Mit freundlichen Grüßen - | - |Max Mustermann - |""".stripMargin.trim - - val letterENText = - """Derek Jeter - | - |123 Elm Ave. - | - |Treesville, ON MI1N 2P3 - |November 7, 2016 - | - |Derek Jeter, 123 Elm Ave., Treesville, ON M1N 2P3, November 7, 2016 - | - |Mr. M. Leat - | - |Chief of Syrup Production - |Old Sticky Pancake Company - |456 Maple Lane - | - |Forest, ON 7TW8 9Y0 - | - |Hemptown, September 3, 2019 - |Dear Mr. Leaf, - | - |Let me begin by thanking you for your past contributions to our Little League baseball - |team. Your sponsorship aided in the purchase of ten full uniforms and several pieces of - |baseball equipment for last year’s season. - | - |Next month, our company is planning an employee appreciation pancake breakfast hon- - |oring retired employees for their past years of service and present employees for their - |loyalty and dedication in spite of the current difficult economic conditions. - | - |We would like to place an order with your company for 25 pounds of pancake mix and - |five gallons of maple syrup. We hope you will be able to provide these products in the - |bulk quantities we require. - | - |As you are a committed corporate sponsor and long-time associate, we hope that you - |will be able to join us for breakfast on December 12, 2016. 
- | - |Respectfully yours, - | - |Derek Jeter - |""".stripMargin.trim -} diff --git a/modules/webapp/src/main/elm/App/View.elm b/modules/webapp/src/main/elm/App/View.elm index f3451644..5a9cd00d 100644 --- a/modules/webapp/src/main/elm/App/View.elm +++ b/modules/webapp/src/main/elm/App/View.elm @@ -252,6 +252,19 @@ loginInfo model = , text "New Invites" ] , div [ class "divider" ] [] + , a + [ class "icon item" + , href "https://eikek.github.io/docspell/doc" + , target "_new" + , title "Opens https://eikek.github.io/docspell/doc" + ] + [ i [ class "help icon" ] [] + , text "Help" + , span [ class "ui right floated" ] + [ i [ class "external link icon" ] [] + ] + ] + , div [ class "divider" ] [] , a [ class "icon item" , href "" diff --git a/modules/webapp/src/main/elm/Comp/Dropzone.elm b/modules/webapp/src/main/elm/Comp/Dropzone.elm index 3ef12443..e551209e 100644 --- a/modules/webapp/src/main/elm/Comp/Dropzone.elm +++ b/modules/webapp/src/main/elm/Comp/Dropzone.elm @@ -35,7 +35,7 @@ type alias Settings = defaultSettings : Settings defaultSettings = { classList = \_ -> [ ( "ui placeholder segment", True ) ] - , contentTypes = [ "application/pdf" ] + , contentTypes = [] } @@ -148,7 +148,11 @@ filterMime settings files = pred f = List.member (File.mime f) settings.contentTypes in - List.filter pred files + if settings.contentTypes == [] then + files + + else + List.filter pred files dropDecoder : D.Decoder Msg diff --git a/modules/webapp/src/main/elm/Comp/FixedDropdown.elm b/modules/webapp/src/main/elm/Comp/FixedDropdown.elm new file mode 100644 index 00000000..586dff32 --- /dev/null +++ b/modules/webapp/src/main/elm/Comp/FixedDropdown.elm @@ -0,0 +1,108 @@ +module Comp.FixedDropdown exposing + ( Item + , Model + , Msg + , init + , initMap + , initString + , initTuple + , update + , view + ) + +import Html exposing (..) +import Html.Attributes exposing (..) 
+import Html.Events exposing (onClick) + + +type alias Item a = + { id : a + , display : String + } + + +type alias Model a = + { options : List (Item a) + , menuOpen : Bool + } + + +type Msg a + = SelectItem (Item a) + | ToggleMenu + + +init : List (Item a) -> Model a +init options = + { options = options + , menuOpen = False + } + + +initString : List String -> Model String +initString strings = + init <| List.map (\s -> Item s s) strings + + +initMap : (a -> String) -> List a -> Model a +initMap elToString els = + init <| List.map (\a -> Item a (elToString a)) els + + +initTuple : List ( String, a ) -> Model a +initTuple tuples = + let + mkItem ( txt, id ) = + Item id txt + in + init <| List.map mkItem tuples + + +update : Msg a -> Model a -> ( Model a, Maybe a ) +update msg model = + case msg of + ToggleMenu -> + ( { model | menuOpen = not model.menuOpen }, Nothing ) + + SelectItem item -> + ( model, Just item.id ) + + +view : Maybe (Item a) -> Model a -> Html (Msg a) +view selected model = + div + [ classList + [ ( "ui selection dropdown", True ) + , ( "open", model.menuOpen ) + ] + , onClick ToggleMenu + ] + [ input [ type_ "hidden" ] [] + , i [ class "dropdown icon" ] [] + , div + [ classList + [ ( "default", selected == Nothing ) + , ( "text", True ) + ] + ] + [ Maybe.map .display selected + |> Maybe.withDefault "Select…" + |> text + ] + , div + [ classList + [ ( "menu transition", True ) + , ( "hidden", not model.menuOpen ) + , ( "visible", model.menuOpen ) + ] + ] + <| + List.map renderItems model.options + ] + + +renderItems : Item a -> Html (Msg a) +renderItems item = + div [ class "item", onClick (SelectItem item) ] + [ text item.display + ] diff --git a/modules/webapp/src/main/elm/Comp/ItemDetail.elm b/modules/webapp/src/main/elm/Comp/ItemDetail.elm index 9c0bc7c6..1fde18a2 100644 --- a/modules/webapp/src/main/elm/Comp/ItemDetail.elm +++ b/modules/webapp/src/main/elm/Comp/ItemDetail.elm @@ -40,6 +40,7 @@ import Http import Markdown import Page exposing (Page(..)) import Util.Http +import Util.List import Util.Maybe import Util.Size import Util.String @@ -1133,15 +1134,16 @@ renderAttachmentsTabMenu model = div [ class "ui top attached tabular menu" ] (List.indexedMap (\pos -> - \a -> - div + \el -> + a [ classList [ ( "item", True ) , ( "active", pos == model.visibleAttach ) ] + , title (Maybe.withDefault "No Name" el.name) , onClick (SetActiveAttachment pos) ] - [ Maybe.map (Util.String.ellipsis 20) a.name + [ Maybe.map (Util.String.ellipsis 20) el.name |> Maybe.withDefault "No Name" |> text ] @@ -1176,6 +1178,29 @@ renderAttachmentView model pos attach = ] , div [ class "right menu" ] [ a + [ classList + [ ( "item", True ) + , ( "disabled", not attach.converted ) + ] + , title + (if attach.converted then + case Util.List.find (\s -> s.id == attach.id) model.item.sources of + Just src -> + "Goto original: " + ++ Maybe.withDefault "" src.name + + Nothing -> + "Goto original file" + + else + "This is the original file" + ) + , href (fileUrl ++ "/original") + , target "_new" + ] + [ i [ class "external square alternate icon" ] [] + ] + , a [ classList [ ( "toggle item", True ) , ( "active", isAttachMetaOpen model attach.id ) @@ -1188,7 +1213,7 @@ renderAttachmentView model pos attach = ] , a [ class "item" - , title "Download to disk" + , title "Download PDF to disk" , download attachName , href fileUrl ] @@ -1258,14 +1283,20 @@ renderItemInfo : Model -> Html Msg renderItemInfo model = let date = - div [ class "item" ] + div + [ class "item" + , title "Item Date" + ] [ 
Maybe.withDefault model.item.created model.item.itemDate |> Util.Time.formatDate |> text ] duedate = - div [ class "item" ] + div + [ class "item" + , title "Due Date" + ] [ i [ class "bell icon" ] [] , Maybe.map Util.Time.formatDate model.item.dueDate |> Maybe.withDefault "" @@ -1273,7 +1304,10 @@ renderItemInfo model = ] corr = - div [ class "item" ] + div + [ class "item" + , title "Correspondent" + ] [ i [ class "envelope outline icon" ] [] , List.filterMap identity [ model.item.corrOrg, model.item.corrPerson ] |> List.map .name @@ -1283,7 +1317,10 @@ renderItemInfo model = ] conc = - div [ class "item" ] + div + [ class "item" + , title "Concerning" + ] [ i [ class "comment outline icon" ] [] , List.filterMap identity [ model.item.concPerson, model.item.concEquipment ] |> List.map .name @@ -1293,13 +1330,22 @@ renderItemInfo model = ] src = - div [ class "item" ] + div + [ class "item" + , title "Source" + ] [ text model.item.source ] in div [ class "ui fluid container" ] - (h2 [ class "ui header" ] - [ i [ class (Data.Direction.iconFromString model.item.direction) ] [] + (h2 + [ class "ui header" + ] + [ i + [ class (Data.Direction.iconFromString model.item.direction) + , title model.item.direction + ] + [] , div [ class "content" ] [ text model.item.name , div @@ -1479,29 +1525,13 @@ renderEditForm model = [ i [ class "tiny edit icon" ] [] , div [ class "content" ] [ text "Notes" - , div [ class "sub header" ] - [ a - [ class "ui link" - , target "_blank" - , href "https://guides.github.com/features/mastering-markdown" - ] - [ text "Markdown" - ] - , text " is supported" - ] ] ] , div [ class "field" ] - [ div [ class "ui action input" ] - [ textarea - [ rows 6 - , autocomplete False - , onInput SetNotes - , Maybe.withDefault "" model.notesModel |> value - ] - [] - , button [ class "ui icon button", onClick ToggleEditNotes ] - [ i [ class "save outline icon" ] [] + [ div [ class "ui input" ] + [ button [ class "ui basic primary fluid button", onClick ToggleEditNotes ] + [ i [ class "edit outline icon" ] [] + , text "Toggle Notes Form" ] ] ] diff --git a/modules/webapp/src/main/elm/Comp/SourceForm.elm b/modules/webapp/src/main/elm/Comp/SourceForm.elm index d0ba0a4d..73e7ee17 100644 --- a/modules/webapp/src/main/elm/Comp/SourceForm.elm +++ b/modules/webapp/src/main/elm/Comp/SourceForm.elm @@ -9,7 +9,7 @@ module Comp.SourceForm exposing ) import Api.Model.Source exposing (Source) -import Comp.Dropdown +import Comp.FixedDropdown import Data.Flags exposing (Flags) import Data.Priority exposing (Priority) import Html exposing (..) 
@@ -21,7 +21,8 @@ type alias Model = { source : Source , abbrev : String , description : Maybe String - , priority : Comp.Dropdown.Model Priority + , priorityModel : Comp.FixedDropdown.Model Priority + , priority : Priority , enabled : Bool } @@ -31,13 +32,11 @@ emptyModel = { source = Api.Model.Source.empty , abbrev = "" , description = Nothing - , priority = - Comp.Dropdown.makeSingleList - { makeOption = \p -> { text = Data.Priority.toName p, value = Data.Priority.toName p } - , placeholder = "" - , options = Data.Priority.all - , selected = Nothing - } + , priorityModel = + Comp.FixedDropdown.initMap + Data.Priority.toName + Data.Priority.all + , priority = Data.Priority.Low , enabled = False } @@ -57,11 +56,7 @@ getSource model = | abbrev = model.abbrev , description = model.description , enabled = model.enabled - , priority = - Comp.Dropdown.getSelected model.priority - |> List.head - |> Maybe.map Data.Priority.toName - |> Maybe.withDefault s.priority + , priority = Data.Priority.toName model.priority } @@ -70,7 +65,7 @@ type Msg | SetSource Source | SetDescr String | ToggleEnabled - | PrioDropdownMsg (Comp.Dropdown.Msg Priority) + | PrioDropdownMsg (Comp.FixedDropdown.Msg Priority) update : Flags -> Msg -> Model -> ( Model, Cmd Msg ) @@ -95,12 +90,8 @@ update _ msg model = , abbrev = t.abbrev , description = t.description , priority = - Comp.Dropdown.makeSingleList - { makeOption = \p -> { text = Data.Priority.toName p, value = Data.Priority.toName p } - , placeholder = "" - , options = Data.Priority.all - , selected = Data.Priority.fromString t.priority - } + Data.Priority.fromString t.priority + |> Maybe.withDefault Data.Priority.Low , enabled = t.enabled } , Cmd.none @@ -126,14 +117,25 @@ update _ msg model = PrioDropdownMsg m -> let - ( m2, c2 ) = - Comp.Dropdown.update m model.priority + ( m2, p2 ) = + Comp.FixedDropdown.update m model.priorityModel in - ( { model | priority = m2 }, Cmd.map PrioDropdownMsg c2 ) + ( { model + | priorityModel = m2 + , priority = Maybe.withDefault model.priority p2 + } + , Cmd.none + ) view : Flags -> Model -> Html Msg view flags model = + let + priorityItem = + Comp.FixedDropdown.Item + model.priority + (Data.Priority.toName model.priority) + in div [ class "ui form" ] [ div [ classList @@ -171,7 +173,11 @@ view flags model = ] , div [ class "field" ] [ label [] [ text "Priority" ] - , Html.map PrioDropdownMsg (Comp.Dropdown.view model.priority) + , Html.map PrioDropdownMsg + (Comp.FixedDropdown.view + (Just priorityItem) + model.priorityModel + ) ] , urlInfoMessage flags model ] diff --git a/nix/module-joex.nix b/nix/module-joex.nix index 516947fa..9c39651f 100644 --- a/nix/module-joex.nix +++ b/nix/module-joex.nix @@ -30,30 +30,74 @@ let wakeup-period = "30 minutes"; }; extraction = { - page-range = { - begin = 10; + pdf = { + min-text-len = 10; }; - ghostscript = { - working-dir = "/tmp/docspell-extraction"; - command = { - program = "${pkgs.ghostscript}/bin/gs"; - args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ]; - timeout = "5 minutes"; + + ocr = { + max-image-size = 14000000; + page-range = { + begin = 10; + }; + ghostscript = { + working-dir = "/tmp/docspell-extraction"; + command = { + program = "${pkgs.ghostscript}/bin/gs"; + args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ]; + timeout = "5 minutes"; + }; + }; + unpaper = { + command = { + program = "${pkgs.unpaper}/bin/unpaper"; + args = [ "{{infile}}" "{{outfile}}" ]; + 
timeout = "5 minutes"; + }; + }; + tesseract = { + command = { + program = "${pkgs.tesseract4}/bin/tesseract"; + args = ["{{file}}" "stdout" "-l" "{{lang}}" ]; + timeout = "5 minutes"; + }; }; }; - unpaper = { - command = { - program = "${pkgs.unpaper}/bin/unpaper"; - args = [ "{{infile}}" "{{outfile}}" ]; - timeout = "5 minutes"; - }; + }; + convert = { + chunk-size = 524288; + max-image-size = 14000000; + + markdown = { + internal-css = '' + body { padding: 2em 5em; } + ''; }; + + wkhtmlpdf = { + command = { + program = "${pkgs.wkhtmltopdf}/bin/wkhtmltopdf"; + args = ["-s" "A4" "--encoding" "UTF-8" "-" "{{outfile}}"]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + tesseract = { - command= { + command = { program = "${pkgs.tesseract4}/bin/tesseract"; - args = ["{{file}}" "stdout" "-l" "{{lang}}" ]; + args = ["{{infile}}" "out" "-l" "{{lang}}" "pdf" "txt"]; timeout = "5 minutes"; }; + working-dir = "/tmp/docspell-convert"; + }; + + unoconv = { + command = { + program = "${pkgs.unoconv}/bin/unoconv"; + args = ["-f" "pdf" "-o" "{{outfile}}" "{{infile}}"]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; }; }; }; @@ -199,128 +243,164 @@ in { extraction = mkOption { type = types.submodule({ options = { - page-range = mkOption { + pdf = mkOption { type = types.submodule({ options = { - begin = mkOption { + min-text-len = mkOption { type = types.int; - default = defaults.extraction.page-range.begin; - description = "Specifies the first N pages of a file to process."; + default = defaults.extraction.pdf.min-text-len; + description = '' + For PDF files it is first tried to read the text parts of the + PDF. But PDFs can be complex documents and they may contain text + and images. If the returned text is shorter than the value + below, OCR is run afterwards. Then both extracted texts are + compared and the longer will be used. + ''; }; }; }); - default = defaults.extraction.page-range; - description = '' - Defines what pages to process. If a PDF with 600 pages is - submitted, it is probably not necessary to scan through all of - them. This would take a long time and occupy resources for no - value. The first few pages should suffice. The default is first - 10 pages. + default = defaults.extraction.pdf; + description = "Settings for PDF extraction"; + }; + ocr = mkOption { + type = types.submodule({ + options = { + max-image-size = mkOption { + type = types.int; + default = defaults.extraction.ocr.max-image-size; + description = '' + Images greater than this size are skipped. Note that every + image is loaded completely into memory for doing OCR. + ''; + }; + page-range = mkOption { + type = types.submodule({ + options = { + begin = mkOption { + type = types.int; + default = defaults.extraction.ocr.page-range.begin; + description = "Specifies the first N pages of a file to process."; + }; + }; + }); + default = defaults.extraction.ocr.page-range; + description = '' + Defines what pages to process. If a PDF with 600 pages is + submitted, it is probably not necessary to scan through all of + them. This would take a long time and occupy resources for no + value. The first few pages should suffice. The default is first + 10 pages. - If you want all pages being processed, set this number to -1. + If you want all pages being processed, set this number to -1. - Note: if you change the ghostscript command below, be aware that - this setting (if not -1) will add another parameter to the - beginning of the command.
- ''; - ghostscript = mkOption { - type = types.submodule({ - options = { - working-dir = mkOption { - type = types.str; - default = defaults.extraction.ghostscript.working-dir; - description = "Directory where the extraction processes can put their temp files"; + Note: if you change the ghostscript command below, be aware that + this setting (if not -1) will add another parameter to the + beginning of the command. + ''; }; - command = mkOption { + ghostscript = mkOption { type = types.submodule({ options = { - program = mkOption { + working-dir = mkOption { type = types.str; - default = defaults.extraction.ghostscript.command.program; - description = "The path to the executable."; + default = defaults.extraction.ocr.ghostscript.working-dir; + description = "Directory where the extraction processes can put their temp files"; }; - args = mkOption { - type = types.listOf types.str; - default = defaults.extraction.ghostscript.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.ghostscript.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ocr.ghostscript.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.ghostscript.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.ghostscript.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.ghostscript.command; + description = "The system command"; }; }; }); - default = defaults.extraction.ghostscript.command; - description = "The system command"; + default = defaults.extraction.ocr.ghostscript; + description = "The ghostscript command."; }; - }; - }); - default = defaults.extraction.ghostscript; - description = "The ghostscript command."; - }; - unpaper = mkOption { - type = types.submodule({ - options = { - command = mkOption { + unpaper = mkOption { type = types.submodule({ options = { - program = mkOption { - type = types.str; - default = defaults.extraction.unpaper.command.program; - description = "The path to the executable."; - }; - args = mkOption { - type = types.listOf types.str; - default = defaults.extraction.unpaper.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.unpaper.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ocr.unpaper.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.unpaper.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.unpaper.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.unpaper.command; + description = "The system command"; }; }; }); - default = defaults.extraction.unpaper.command; - description = "The system command"; + default = defaults.extraction.ocr.unpaper; + description = "The unpaper command."; }; -
}; - }); - default = defaults.extraction.unpaper; - description = "The unpaper command."; - }; - tesseract = mkOption { - type = types.submodule({ - options = { - command = mkOption { + tesseract = mkOption { type = types.submodule({ options = { - program = mkOption { - type = types.str; - default = defaults.extraction.tesseract.command.program; - description = "The path to the executable."; - }; - args = mkOption { - type = types.listOf types.str; - default = defaults.extraction.tesseract.command.args; - description = "The arguments to the program"; - }; - timeout = mkOption { - type = types.str; - default = defaults.extraction.tesseract.command.timeout; - description = "The timeout when executing the command"; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ocr.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ocr.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ocr.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ocr.tesseract.command; + description = "The system command"; }; }; }); - default = defaults.extraction.tesseract.command; - description = "The system command"; + default = defaults.extraction.ocr.tesseract; + description = "The tesseract command."; }; + }; }); - default = defaults.extraction.tesseract; - description = "The tesseract command."; + default = defaults.extraction.ocr; + description = "Settings for OCR."; }; }; }); @@ -336,6 +416,182 @@ in { below. ''; }; + + convert = mkOption { + type = types.submodule({ + options = { + chunk-size = mkOption { + type = types.int; + default = defaults.convert.chunk-size; + description = '' + The chunk size used when storing files. This should be the same + as used with the rest server. + ''; + }; + max-image-size = mkOption { + type = types.int; + default = defaults.convert.max-image-size; + description = '' + When reading images, this is the maximum size. Images that are + larger are not processed. + ''; + }; + markdown = mkOption { + type = types.submodule({ + options = { + internal-css = mkOption { + type = types.str; + default = defaults.convert.markdown.internal-css; + description = '' + The CSS that is used to style the resulting HTML. + ''; + }; + }; + }); + default = defaults.convert.markdown; + description = '' + Settings when processing markdown files (and other text files) + to HTML. + + In order to support text formats, text files are first converted + to HTML using a markdown processor. The resulting HTML is then + converted to a PDF file.
+ ''; + }; + wkhtmlpdf = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.wkhtmlpdf.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf; + description = '' + To convert HTML files into PDF files, the external tool + wkhtmltopdf is used. + ''; + }; + tesseract = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.tesseract.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.tesseract.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.tesseract; + description = '' + To convert image files to PDF files, tesseract is used. This + also extracts the text in one go. + ''; + }; + unoconv = mkOption { + type = types.submodule({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.unoconv.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.unoconv.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.unoconv.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.unoconv; + description = '' + To convert "office" files to PDF files, the external tool + unoconv is used. Unoconv uses libreoffice/openoffice for + converting. So it supports all formats that can be read + with libreoffice/openoffice. + + Note: to greatly improve performance, it is recommended to start + a libreoffice listener by running `unoconv -l` in a separate + process.
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
index 97bd76f5..3b6ceee1 100644
--- a/project/Dependencies.scala
+++ b/project/Dependencies.scala
@@ -11,6 +11,7 @@ object Dependencies {
   val DoobieVersion = "0.8.8"
   val EmilVersion = "0.2.0"
   val FastparseVersion = "2.1.3"
+  val FlexmarkVersion = "0.60.2"
   val FlywayVersion = "6.2.4"
   val Fs2Version = "2.2.2"
   val H2Version = "1.4.200"
@@ -20,17 +21,60 @@ object Dependencies {
   val LogbackVersion = "1.2.3"
   val MariaDbVersion = "2.5.4"
   val MiniTestVersion = "2.7.0"
+  val PdfboxVersion = "2.0.18"
+  val PoiVersion = "4.1.1"
   val PostgresVersion = "42.2.10"
   val PureConfigVersion = "0.12.2"
+  val Slf4jVersion = "1.7.30"
   val SqliteVersion = "3.30.1"
   val StanfordNlpVersion = "3.9.2"
   val TikaVersion = "1.23"
   val YamuscaVersion = "0.6.1"
   val SwaggerUIVersion = "3.25.0"
   val SemanticUIVersion = "2.4.1"
+  val TwelveMonkeysVersion = "3.5"
   val JQueryVersion = "3.4.1"
   val ViewerJSVersion = "0.5.8"
+
+  val jclOverSlf4j = Seq(
+    "org.slf4j" % "jcl-over-slf4j" % Slf4jVersion
+  )
+  val julOverSlf4j = Seq(
+    "org.slf4j" % "jul-to-slf4j" % Slf4jVersion
+  )
+
+  val poi = Seq(
+    "org.apache.poi" % "poi" % PoiVersion,
+    "org.apache.poi" % "poi-ooxml" % PoiVersion,
+    "org.apache.poi" % "poi-scratchpad" % PoiVersion
+  ).map(_.excludeAll(
+    ExclusionRule("commons-logging")
+  )) ++ jclOverSlf4j
+
+  // https://github.com/vsch/flexmark-java
+  // BSD 2-Clause
+  val flexmark = Seq(
+    "com.vladsch.flexmark" % "flexmark" % FlexmarkVersion,
+    "com.vladsch.flexmark" % "flexmark-ext-tables" % FlexmarkVersion,
+    "com.vladsch.flexmark" % "flexmark-ext-gfm-strikethrough" % FlexmarkVersion
+  ).map(_.excludeAll(
+    ExclusionRule("junit"),
+    ExclusionRule("hamcrest-core")
+  ))
+
+  val twelvemonkeys = Seq(
+    "com.twelvemonkeys.imageio" % "imageio-jpeg" % TwelveMonkeysVersion,
+    "com.twelvemonkeys.imageio" % "imageio-tiff" % TwelveMonkeysVersion
+  )
+
+  val pdfbox = Seq(
+    "org.apache.pdfbox" % "pdfbox" % PdfboxVersion excludeAll (
+      ExclusionRule("org.bouncycastle"),
+      ExclusionRule("commons-logging")
+    )
+  ) ++ jclOverSlf4j
+
   val emil = Seq(
     "com.github.eikek" %% "emil-common" % EmilVersion,
     "com.github.eikek" %% "emil-javamail" % EmilVersion
@@ -61,6 +105,12 @@ object Dependencies {
   val tika = Seq(
     "org.apache.tika" % "tika-core" % TikaVersion
   )
+  val commonsIO = Seq(
+    "commons-io" % "commons-io" % "2.6"
+  )
+  val tikaParser = Seq(
+    "org.apache.tika" % "tika-parsers" % TikaVersion
+  )

   val bcrypt = Seq(
     "org.mindrot" % "jbcrypt" % BcryptVersion
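Among the new dependencies, flexmark implements the markdown-to-HTML step that the `convert.markdown` option above describes (the tables and strikethrough extension artifacts are pulled in as well, but omitted here for brevity). A minimal sketch of that step; the actual wiring in docspell-convert may differ:

    import com.vladsch.flexmark.parser.Parser
    import com.vladsch.flexmark.html.HtmlRenderer

    // Markdown in, HTML out; the HTML is later turned into a PDF
    // (by wkhtmltopdf, per the joex config above).
    val parser   = Parser.builder().build()
    val renderer = HtmlRenderer.builder().build()
    val html     = renderer.render(parser.parse("# Invoice\n\nAmount: *42 EUR*"))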
config="${XDG_CONFIG_HOME:-$HOME/.config}/docspell/ds.conf" +delete=n help=n config="${XDG_CONFIG_HOME:-$HOME/.config}/docspell/ds.conf" while true; do case "$1" in -h|--help) @@ -55,10 +54,6 @@ while true; do config="$2" shift 2 ;; - -s|--skip) - skip="y" - shift - ;; -d|--delete) delete="y" shift @@ -124,7 +119,6 @@ showUsage() { info "" info "Options:" info " -c | --config Provide a config file. (value: $config)" - info " -s | --skip Skip non-PDF files. Otherwise an error is raised. (value: $skip)" info " -d | --delete Delete the files when successfully uploaded (value: $delete)" info " -h | --help Prints this help text. (value: $help)" info "" @@ -133,16 +127,6 @@ showUsage() { info "" } -mimetype() { - $FILE_CMD -b --mime-type "$1" -} - -isPdf() { - mime=$(mimetype "$1") - [ "$mime" = "application/pdf" ] -} - - if [ "$help" = "y" ]; then showUsage exit 0 @@ -154,15 +138,6 @@ if [[ $# -eq 0 ]]; then exit 4 fi -if [ "$skip" = "n" ]; then - IFS=$'\n' - for file in $*; do - if ! isPdf "$file"; then - info "Not a PDF file: $file" - exit 5 - fi - done -fi ## Read the config file declare -a urls @@ -177,18 +152,14 @@ done <<< $($GREP_CMD -v '^#.*' "$config") ## Main IFS=$'\n' for file in $*; do - if isPdf "$file"; then - for url in "${urls[@]}"; do - info "Uploading '$file' to '$url'" - set +e - upload "$file" "$url" - set -e - if [ "$delete" = "y" ] && [ $? -eq 0 ]; then - info "Deleting file: $file" - rm -f "$file" - fi - done - else - info "Skipping non-PDF file: $file" - fi + for url in "${urls[@]}"; do + info "Uploading '$file' to '$url'" + set +e + upload "$file" "$url" + set -e + if [ "$delete" = "y" ] && [ $? -eq 0 ]; then + info "Deleting file: $file" + rm -f "$file" + fi + done done