diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45b48147..d8a25793 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: working-directory: modules/webapp - name: Fetch tags run: git fetch --depth=100 origin +refs/tags/*:refs/tags/* - - uses: olafurpg/setup-scala@v12 + - uses: olafurpg/setup-scala@v13 with: java-version: ${{ matrix.java }} - name: Coursier cache diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index 624dd949..e5f0f671 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v2.3.4 with: fetch-depth: 0 - - uses: olafurpg/setup-scala@v12 + - uses: olafurpg/setup-scala@v13 with: java-version: ${{ matrix.java }} - uses: jorelali/setup-elm@v3 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d606b396..ba3ff89c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v2.3.4 with: fetch-depth: 0 - - uses: olafurpg/setup-scala@v12 + - uses: olafurpg/setup-scala@v13 with: java-version: ${{ matrix.java }} - uses: jorelali/setup-elm@v3 diff --git a/Changelog.md b/Changelog.md index 1f628076..9d374f39 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,15 +1,58 @@ # Changelog +## v0.25.1 + +*Jul 29, 2021* + +- Fix solr fulltext search by adding the new japanese content field + +The SOLR fulltext search is broken in 0.25.0, so this is a fixup +release. + ## v0.25.0 -*Unreleased* +*Jul 29, 2021* + +- Introducing a new CLI tool (#345) that replaces all the shell + scripts from the `tools/` directory! 
https://github.com/docspell/dsc +- UI changes: + - year separators are now more prominent (#950) + - fixes a bug in the item counter in detail view when an item is + deleted (#920) + - German translation improvements (#901) + - The number of selected files is shown on the upload page (#896) +- The created date of an item can now be used in queries (#925, #958) +- The API for setting tags has been improved (#955) +- Task for converting pdfs is now behind the admin secret (#949) +- Task for generating preview images is now behind the admin secret (#915) +- respond with 404 when the source-id is not correct (#931) +- Update of core libraries (#890) +- Add Japanese to the list of document languages. Thanks @wallace11 + for helping out (#948, #962) +- Fix setting the folder from metadata when processing a file and + allow specifying it by name or id (#940) +- Fixes docspell config file in docker-compose setup (#909) +- Fixes selecting the next job in the job executor (#898) +- Fixes a bug that prevents uploading more than one file at once + (#938) ### Rest API Changes +- Removed `sec/item/convertallpdfs` endpoint in favor of the new + `admin/attachments/convertallpdfs` endpoint which is now an admin + task - Removed `sec/collective/previews` endpoint, in favor for new `admin/attachments/generatePreviews` endpoint which is now an admin task to generate previews for all files. The now removed enpoint did this only for one collective. +- `/sec/item/{id}/tags`: Setting tags to an item (replacing existing + tags) has been changed to allow tags to be specified as names or ids +- `/sec/item/{id}/tagsremove`: Added a route to remove tags for a + single item + +### Configuration Changes + +None. ## v0.24.0 diff --git a/README.md b/README.md index 45fa9f89..633882c5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,8 @@ fulltext search and has great e-mail integration. Everything is accessible via a REST/HTTP api. A mobile friendly SPA web application is the default user interface.
An [Android app](https://github.com/docspell/android-client) exists for -conveniently uploading files from your phone/tablet. The [feature +conveniently uploading files from your phone/tablet, and there is also a +[CLI](https://github.com/docspell/dsc). The [feature overview](https://docspell.org/#feature-selection) lists some more points. diff --git a/build.sbt b/build.sbt index 849d7a71..3eea7c5e 100644 --- a/build.sbt +++ b/build.sbt @@ -88,8 +88,8 @@ val elmSettings = Seq( Compile / unmanagedSourceDirectories += (Compile / sourceDirectory).value / "elm", headerSources / includeFilter := "*.elm", headerMappings := headerMappings.value + (HeaderFileType("elm") -> HeaderCommentStyle( - new CommentBlockCreator("{-", " ", "-}"), - HeaderPattern.commentBetween("\\{\\-", " ", "\\-\\}") + new CommentBlockCreator("{-", " ", "-}\n"), + HeaderPattern.commentBetween("\\{\\-", " ", "\\-\\}") )) ) val stylesSettings = Seq( diff --git a/docker/docker-compose/docker-compose.yml b/docker/docker-compose/docker-compose.yml index 25a6139b..221a06c3 100644 --- a/docker/docker-compose/docker-compose.yml +++ b/docker/docker-compose/docker-compose.yml @@ -30,16 +30,18 @@ services: - solr consumedir: - image: docspell/tools:latest + image: docspell/dsc:latest container_name: docspell-consumedir command: - - ds-consumedir - - "-vmdi" - - "--path" - - "/opt/docs" - - "--iheader" + - dsc + - "-d" + - "http://docspell-restserver:7880" + - "watch" + - "--delete" + - "-ir" + - "--header" - "Docspell-Integration:$DOCSPELL_HEADER_VALUE" - - "http://docspell-restserver:7880/api/v1/open/integration/item" + - "/opt/docs" restart: unless-stopped env_file: ./.env volumes: diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index 130f7c30..a75de16c 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -29,6 +29,7 @@ RUN JDKPKG="openjdk11"; \ tesseract-ocr-data-rus \ tesseract-ocr-data-ron \ tesseract-ocr-data-lav \ + tesseract-ocr-data-jpn \
unpaper \ wkhtmltopdf \ libreoffice \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 038dba08..4d90324e 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -21,9 +21,7 @@ import docspell.common._ object DateFind { def findDates(text: String, lang: Language): Stream[Pure, NerDateLabel] = - TextSplitter - .splitToken(text, " \t.,\n\r/".toSet) - .filter(w => lang != Language.Latvian || w.value != "gada") + splitWords(text, lang) .sliding(3) .filter(_.size == 3) .flatMap(q => @@ -44,6 +42,20 @@ object DateFind { ) ) + private[this] val jpnChars = + ("年月日" + MonthName.getAll(Language.Japanese).map(_.mkString).mkString).toSet + + private def splitWords(text: String, lang: Language): Stream[Pure, Word] = { + val stext = + if (lang == Language.Japanese) { + text.map(c => if (jpnChars.contains(c)) c else ' ') + } else text + + TextSplitter + .splitToken(stext, " \t.,\n\r/年月日".toSet) + .filter(w => lang != Language.Latvian || w.value != "gada") + } + case class SimpleDate(year: Int, month: Int, day: Int) { def toLocalDate: LocalDate = LocalDate.of(if (year < 100) 2000 + year else year, month, day) @@ -89,6 +101,7 @@ object DateFind { case Language.Swedish => ymd.or(dmy).or(mdy) case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Latvian => dmy.or(lavLong).or(ymd) + case Language.Japanese => ymd } p.read(parts) match { case Result.Success(sds, _) => diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index 8a5852d0..37b16852 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -50,6 +50,8 @@ object MonthName { russian case Language.Latvian 
=> latvian + case Language.Japanese => + japanese } private val numbers = List( @@ -290,4 +292,19 @@ object MonthName { List("novembris", "nov."), List("decembris", "dec.") ) + + private val japanese = List( + List("1", "一"), + List("2", "二"), + List("3", "三"), + List("4", "四"), + List("5", "五"), + List("6", "六"), + List("7", "七"), + List("8", "八"), + List("9", "九"), + List("10", "十"), + List("11", "十一"), + List("12", "十二") + ) } diff --git a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala index 7784fd2f..a41eb6d3 100644 --- a/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala +++ b/modules/analysis/src/test/scala/docspell/analysis/date/DateFindSpec.scala @@ -143,4 +143,40 @@ class DateFindSpec extends FunSuite { ) } + test("find japanese dates") { + assertEquals( + DateFind + .findDates("今日の日付は2021.7.21です。", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021.7.21", NerTag.Date, 6, 15) + ) + ) + ) + assertEquals( + DateFind + .findDates("今日の日付は2021年7月21日です。", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021年7月21", NerTag.Date, 6, 15) + ) + ) + ) + assertEquals( + DateFind + .findDates("年月日2021年7月21日(日)", Language.Japanese) + .toVector, + Vector( + NerDateLabel( + LocalDate.of(2021, 7, 21), + NerLabel("2021年7月21", NerTag.Date, 3, 12) + ) + ) + ) + } + } diff --git a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala index e1d11a9d..2c1cd242 100644 --- a/modules/backend/src/main/scala/docspell/backend/JobFactory.scala +++ b/modules/backend/src/main/scala/docspell/backend/JobFactory.scala @@ -68,14 +68,14 @@ object JobFactory { args, "Create preview images", now, - submitter.getOrElse(DocspellSystem.taskGroup), + submitter.getOrElse(DocspellSystem.user), 
Priority.Low, Some(DocspellSystem.allPreviewTaskTracker) ) def convertAllPdfs[F[_]: Sync]( collective: Option[Ident], - account: AccountId, + submitter: Option[Ident], prio: Priority ): F[RJob] = for { @@ -84,11 +84,11 @@ object JobFactory { job = RJob.newJob( id, ConvertAllPdfArgs.taskName, - account.collective, + collective.getOrElse(DocspellSystem.taskGroup), ConvertAllPdfArgs(collective), s"Convert all pdfs not yet converted", now, - account.user, + submitter.getOrElse(DocspellSystem.user), prio, collective .map(c => c / ConvertAllPdfArgs.taskName) diff --git a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala index 135162da..d9826904 100644 --- a/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala +++ b/modules/backend/src/main/scala/docspell/backend/ops/OItem.scala @@ -24,7 +24,7 @@ import org.log4s.getLogger trait OItem[F[_]] { /** Sets the given tags (removing all existing ones). */ - def setTags(item: Ident, tagIds: List[Ident], collective: Ident): F[UpdateResult] + def setTags(item: Ident, tagIds: List[String], collective: Ident): F[UpdateResult] /** Sets tags for multiple items. The tags of the items will be * replaced with the given ones. 
Same as `setTags` but for multiple @@ -32,7 +32,7 @@ trait OItem[F[_]] { */ def setTagsMultipleItems( items: NonEmptyList[Ident], - tags: List[Ident], + tags: List[String], collective: Ident ): F[UpdateResult] @@ -181,7 +181,7 @@ trait OItem[F[_]] { */ def convertAllPdf( collective: Option[Ident], - account: AccountId, + submitter: Option[Ident], notifyJoex: Boolean ): F[UpdateResult] @@ -304,19 +304,20 @@ object OItem { def setTags( item: Ident, - tagIds: List[Ident], + tagIds: List[String], collective: Ident ): F[UpdateResult] = setTagsMultipleItems(NonEmptyList.of(item), tagIds, collective) def setTagsMultipleItems( items: NonEmptyList[Ident], - tags: List[Ident], + tags: List[String], collective: Ident ): F[UpdateResult] = UpdateResult.fromUpdate(store.transact(for { - k <- RTagItem.deleteItemTags(items, collective) - res <- items.traverse(i => RTagItem.setAllTags(i, tags)) + k <- RTagItem.deleteItemTags(items, collective) + rtags <- RTag.findAllByNameOrId(tags, collective) + res <- items.traverse(i => RTagItem.setAllTags(i, rtags.map(_.tagId))) n = res.fold } yield k + n)) @@ -687,11 +688,11 @@ object OItem { def convertAllPdf( collective: Option[Ident], - account: AccountId, + submitter: Option[Ident], notifyJoex: Boolean ): F[UpdateResult] = for { - job <- JobFactory.convertAllPdfs[F](collective, account, Priority.Low) + job <- JobFactory.convertAllPdfs[F](collective, submitter, Priority.Low) _ <- queue.insertIfNew(job) _ <- if (notifyJoex) joex.notifyAllNodes else ().pure[F] } yield UpdateResult.success diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index a3e012fa..d46aba3a 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -108,6 +108,11 @@ object Language { val iso3 = "lav" } + case object Japanese extends Language { + val iso2 = "ja" + val iso3 = "jpn" + } + val all: 
List[Language] = List( German, @@ -124,7 +129,8 @@ object Language { Swedish, Russian, Romanian, - Latvian + Latvian, + Japanese ) def fromString(str: String): Either[String, Language] = { diff --git a/modules/extract/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/modules/extract/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java new file mode 100644 index 00000000..2b88a075 --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.exception; + +import org.xml.sax.SAXException; + +public class WriteLimitReachedException extends SAXException { + + //in case of (hopefully impossible) cyclic exception + private final static int MAX_DEPTH = 100; + + private final int writeLimit; + public WriteLimitReachedException(int writeLimit) { + this.writeLimit = writeLimit; + } + + @Override + public String getMessage() { + return "Your document contained more than " + writeLimit + + " characters, and so your requested limit has been" + + " reached. To receive the full text of the document," + + " increase your limit. 
(Text up to the limit is" + " however available)."; + } + /** + * Checks whether the given exception (or any of its root causes) was + * thrown by this handler as a signal of reaching the write limit. + * + * @param t throwable + * @return true if the write limit was reached, + * false otherwise + * @since Apache Tika 2.0 + */ + public static boolean isWriteLimitReached(Throwable t) { + return isWriteLimitReached(t, 0); + } + + private static boolean isWriteLimitReached(Throwable t, int depth) { + if (t == null) { + return false; + } + if (depth > MAX_DEPTH) { + return false; + } + if (t instanceof WriteLimitReachedException) { + return true; + } else { + return t.getCause() != null && isWriteLimitReached(t.getCause(), depth + 1); + } + } + /** Rethrows the WriteLimitReachedException found in ex or anywhere in its cause chain (up to MAX_DEPTH levels); returns normally if there is none. */ + public static void throwIfWriteLimitReached(Exception ex) throws SAXException { + throwIfWriteLimitReached(ex, 0); + } + + private static void throwIfWriteLimitReached(Throwable ex, int depth) throws SAXException { + if (ex == null) { + return; + } + if (depth > MAX_DEPTH) { + return; + } + if (ex instanceof WriteLimitReachedException) { + throw (SAXException) ex; + } else { + throwIfWriteLimitReached(ex.getCause(), depth + 1); // keep walking the cause chain; the previous isWriteLimitReached(...) call discarded its result + } + } +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java new file mode 100644 index 00000000..416be0fc --- /dev/null +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ContentHandlerDecorator; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * Handler for macros in flat open documents + */ +class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { + + static String MODULE = "module"; + static String NAME = "name"; + private static String SOURCE_CODE = "source-code"; + private final ContentHandler contentHandler; + private final ParseContext parseContext; + private final StringBuilder macroBuffer = new StringBuilder(); + String macroName = null; + boolean inMacro = false; + private EmbeddedDocumentExtractor embeddedDocumentExtractor; + + FlatOpenDocumentMacroHandler(ContentHandler contentHandler, ParseContext parseContext) { + super(contentHandler); + this.contentHandler = contentHandler; + this.parseContext = parseContext; + } + + @Override + public void startElement(String namespaceURI, String localName, String qName, Attributes attrs) + throws SAXException { + if (MODULE.equals(localName)) { + macroName = 
XMLReaderUtils.getAttrValue(NAME, attrs); + } else if (SOURCE_CODE.equals(localName)) { + inMacro = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inMacro) { + macroBuffer.append(ch, start, length); + } + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) + throws SAXException { + if (SOURCE_CODE.equals(localName)) { + try { + handleMacro(); + } catch (IOException e) { + throw new SAXException(e); + } finally { + resetMacroState(); + } + } + } + + protected void resetMacroState() { + macroBuffer.setLength(0); + macroName = null; + inMacro = false; + } + + protected void handleMacro() throws IOException, SAXException { + + byte[] bytes = macroBuffer.toString().getBytes(StandardCharsets.UTF_8); + + if (embeddedDocumentExtractor == null) { + embeddedDocumentExtractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext); + } + Metadata embeddedMetadata = new Metadata(); + if (!isBlank(macroName)) { + embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, macroName); + } + embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + + if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + try (InputStream is = TikaInputStream.get(bytes)) { + embeddedDocumentExtractor + .parseEmbedded(is, contentHandler, embeddedMetadata, false); + } + } + } + + private static boolean isBlank(String s) { + return s == null || s.trim().isEmpty(); + } +} diff --git a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java index 80b2301c..2193e83b 100644 --- a/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java +++ b/modules/extract/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java @@ -1,31 +1,32 @@ 
/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.odf; + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Locale; -import org.apache.tika.sax.ContentHandlerDecorator; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import java.io.IOException; -import java.io.StringReader; -import java.util.Locale; +import org.apache.tika.sax.ContentHandlerDecorator; /** * Content handler decorator that: