mirror of
				https://github.com/TheAnachronism/docspell.git
				synced 2025-10-31 17:50:11 +00:00 
			
		
		
		
	| @@ -53,6 +53,9 @@ object Duration { | ||||
|   def days(n: Long): Duration = | ||||
|     apply(JDur.ofDays(n)) | ||||
|  | ||||
|   def years(n: Long): Duration = | ||||
|     days(n * 365) | ||||
|  | ||||
|   def nanos(n: Long): Duration = | ||||
|     Duration(n) | ||||
|  | ||||
|   | ||||
| @@ -68,7 +68,7 @@ docspell.joex { | ||||
|     # How often a failed job should be retried until it enters failed | ||||
|     # state. If a job fails, it becomes "stuck" and will be retried | ||||
|     # after a delay. | ||||
|     retries = 5 | ||||
|     retries = 2 | ||||
|  | ||||
|     # The delay until the next try is performed for a failed job. This | ||||
|     # delay is increased exponentially with the number of retries. | ||||
| @@ -341,6 +341,13 @@ docspell.joex { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   # General config for processing documents | ||||
|   processing { | ||||
|     # Restricts proposals for due dates. Only dates earlier than this | ||||
|     # number of years in the future are considered. | ||||
|     max-due-date-years = 10 | ||||
|   } | ||||
|  | ||||
|   # The same section is also present in the rest-server config. It is | ||||
|   # used when submitting files into the job queue for processing. | ||||
|   # | ||||
|   | ||||
| @@ -25,7 +25,8 @@ case class Config( | ||||
|     sendMail: MailSendConfig, | ||||
|     files: Files, | ||||
|     mailDebug: Boolean, | ||||
|     fullTextSearch: Config.FullTextSearch | ||||
|     fullTextSearch: Config.FullTextSearch, | ||||
|     processing: Config.Processing | ||||
| ) | ||||
|  | ||||
| object Config { | ||||
| @@ -47,4 +48,6 @@ object Config { | ||||
|  | ||||
|     final case class Migration(indexAllChunk: Int) | ||||
|   } | ||||
|  | ||||
|   case class Processing(maxDueDateYears: Int) | ||||
| } | ||||
|   | ||||
| @@ -8,6 +8,7 @@ import cats.effect.Sync | ||||
| import docspell.analysis.contact._ | ||||
| import docspell.common.MetaProposal.Candidate | ||||
| import docspell.common._ | ||||
| import docspell.joex.Config | ||||
| import docspell.joex.scheduler.{Context, Task} | ||||
| import docspell.store.records._ | ||||
|  | ||||
| @@ -16,33 +17,42 @@ import docspell.store.records._ | ||||
|   */ | ||||
| object FindProposal { | ||||
|  | ||||
|   def apply[F[_]: Sync](data: ItemData): Task[F, ProcessItemArgs, ItemData] = | ||||
|   def apply[F[_]: Sync]( | ||||
|       cfg: Config.Processing | ||||
|   )(data: ItemData): Task[F, ProcessItemArgs, ItemData] = | ||||
|     Task { ctx => | ||||
|       val rmas = data.metas.map(rm => rm.copy(nerlabels = removeDuplicates(rm.nerlabels))) | ||||
|  | ||||
|       ctx.logger.info("Starting find-proposal") *> | ||||
|         rmas | ||||
|           .traverse(rm => | ||||
|             processAttachment(rm, data.findDates(rm), ctx) | ||||
|             processAttachment(cfg, rm, data.findDates(rm), ctx) | ||||
|               .map(ml => rm.copy(proposals = ml)) | ||||
|           ) | ||||
|           .map(rmv => data.copy(metas = rmv)) | ||||
|     } | ||||
|  | ||||
|   def processAttachment[F[_]: Sync]( | ||||
|       cfg: Config.Processing, | ||||
|       rm: RAttachmentMeta, | ||||
|       rd: Vector[NerDateLabel], | ||||
|       ctx: Context[F, ProcessItemArgs] | ||||
|   ): F[MetaProposalList] = { | ||||
|     val finder = Finder.searchExact(ctx).next(Finder.searchFuzzy(ctx)) | ||||
|     List(finder.find(rm.nerlabels), makeDateProposal(rd)) | ||||
|     List(finder.find(rm.nerlabels), makeDateProposal(cfg, rd)) | ||||
|       .traverse(identity) | ||||
|       .map(MetaProposalList.flatten) | ||||
|   } | ||||
|  | ||||
|   def makeDateProposal[F[_]: Sync](dates: Vector[NerDateLabel]): F[MetaProposalList] = | ||||
|   def makeDateProposal[F[_]: Sync]( | ||||
|       cfg: Config.Processing, | ||||
|       dates: Vector[NerDateLabel] | ||||
|   ): F[MetaProposalList] = | ||||
|     Timestamp.current[F].map { now => | ||||
|       val latestFirst     = dates.sortWith((l1, l2) => l1.date.isAfter(l2.date)) | ||||
|       val maxFuture = now.plus(Duration.years(cfg.maxDueDateYears.toLong)) | ||||
|       val latestFirst = dates | ||||
|         .filter(_.date.isBefore(maxFuture.toUtcDate)) | ||||
|         .sortWith((l1, l2) => l1.date.isAfter(l2.date)) | ||||
|       val nowDate         = now.value.atZone(ZoneId.of("GMT")).toLocalDate | ||||
|       val (after, before) = latestFirst.span(ndl => ndl.date.isAfter(nowDate)) | ||||
|  | ||||
|   | ||||
| @@ -2,7 +2,6 @@ package docspell.joex.process | ||||
|  | ||||
| import cats.effect._ | ||||
| import docspell.common.ProcessItemArgs | ||||
| import docspell.analysis.TextAnalysisConfig | ||||
| import docspell.joex.scheduler.Task | ||||
| import docspell.joex.Config | ||||
| import docspell.ftsclient.FtsClient | ||||
| @@ -19,16 +18,16 @@ object ProcessItem { | ||||
|       .flatMap(Task.setProgress(40)) | ||||
|       .flatMap(TextExtraction(cfg.extraction, fts)) | ||||
|       .flatMap(Task.setProgress(60)) | ||||
|       .flatMap(analysisOnly[F](cfg.textAnalysis)) | ||||
|       .flatMap(analysisOnly[F](cfg)) | ||||
|       .flatMap(Task.setProgress(80)) | ||||
|       .flatMap(LinkProposal[F]) | ||||
|       .flatMap(Task.setProgress(99)) | ||||
|  | ||||
|   def analysisOnly[F[_]: Sync]( | ||||
|       cfg: TextAnalysisConfig | ||||
|       cfg: Config | ||||
|   )(item: ItemData): Task[F, ProcessItemArgs, ItemData] = | ||||
|     TextAnalysis[F](cfg)(item) | ||||
|       .flatMap(FindProposal[F]) | ||||
|     TextAnalysis[F](cfg.textAnalysis)(item) | ||||
|       .flatMap(FindProposal[F](cfg.processing)) | ||||
|       .flatMap(EvalProposals[F]) | ||||
|       .flatMap(SaveProposals[F]) | ||||
|  | ||||
|   | ||||
| @@ -0,0 +1,5 @@ | ||||
| ALTER TABLE `item` | ||||
| MODIFY `itemdate` DATETIME NULL; | ||||
|  | ||||
| ALTER TABLE `item` | ||||
| MODIFY `duedate` DATETIME NULL; | ||||
| @@ -35,7 +35,7 @@ let | ||||
|     scheduler = { | ||||
|       pool-size = 2; | ||||
|       counting-scheme = "4,1"; | ||||
|       retries = 5; | ||||
|       retries = 2; | ||||
|       retry-delay = "1 minute"; | ||||
|       log-buffer-size = 500; | ||||
|       wakeup-period = "30 minutes"; | ||||
| @@ -92,6 +92,9 @@ let | ||||
|     text-analysis = { | ||||
|       max-length = 10000; | ||||
|     }; | ||||
|     processing = { | ||||
|       max-due-date-years = 10; | ||||
|     }; | ||||
|     convert = { | ||||
|       chunk-size = 524288; | ||||
|       max-image-size = 14000000; | ||||
| @@ -133,6 +136,19 @@ let | ||||
|       chunk-size = 524288; | ||||
|       valid-mime-types = []; | ||||
|     }; | ||||
|     full-text-search = { | ||||
|       enabled = false; | ||||
|       solr = { | ||||
|         url = "http://localhost:8983/solr/docspell"; | ||||
|         commit-within = 1000; | ||||
|         log-verbose = false; | ||||
|         def-type = "lucene"; | ||||
|         q-op = "OR"; | ||||
|       }; | ||||
|       migration = { | ||||
|         index-all-chunk = 10; | ||||
|       }; | ||||
|     }; | ||||
|   }; | ||||
| in { | ||||
|  | ||||
| @@ -653,6 +669,23 @@ in { | ||||
|         description = "Settings for text analysis"; | ||||
|       }; | ||||
|  | ||||
|       processing = mkOption { | ||||
|         type = types.submodule({ | ||||
|           options = { | ||||
|             max-due-date-years = mkOption { | ||||
|               type = types.int; | ||||
|               default = defaults.processing.max-due-date-years; | ||||
|               description = '' | ||||
|                 Restricts proposals for due dates. Only dates earlier than this | ||||
|                 number of years in the future are considered. | ||||
|               ''; | ||||
|             }; | ||||
|           }; | ||||
|         }); | ||||
|         default = defaults.processing; | ||||
|         description = "General config for processing documents"; | ||||
|       }; | ||||
|  | ||||
|       convert = mkOption { | ||||
|         type = types.submodule({ | ||||
|           options = { | ||||
| @@ -860,6 +893,79 @@ in { | ||||
|         default = defaults.files; | ||||
|         description= "Settings for how files are stored."; | ||||
|       }; | ||||
|       full-text-search = mkOption { | ||||
|         type = types.submodule({ | ||||
|           options = { | ||||
|             enabled = mkOption { | ||||
|               type = types.bool; | ||||
|               default = defaults.full-text-search.enabled; | ||||
|               description = '' | ||||
|                 The full-text search feature can be disabled. It requires an | ||||
|                 additional index server which needs additional memory and disk | ||||
|                 space. It can be enabled later any time. | ||||
|  | ||||
|                 Currently the SOLR search platform is supported. | ||||
|               ''; | ||||
|             }; | ||||
|             solr = mkOption { | ||||
|               type = types.submodule({ | ||||
|                 options = { | ||||
|                   url = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.url; | ||||
|                     description = "The URL to solr"; | ||||
|                   }; | ||||
|                   commit-within = mkOption { | ||||
|                     type = types.int; | ||||
|                     default = defaults.full-text-search.solr.commit-within; | ||||
|                     description = "Used to tell solr when to commit the data"; | ||||
|                   }; | ||||
|                   log-verbose = mkOption { | ||||
|                     type = types.bool; | ||||
|                     default = defaults.full-text-search.solr.log-verbose; | ||||
|                     description = "If true, logs request and response bodies"; | ||||
|                   }; | ||||
|                   def-type = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.def-type; | ||||
|                     description = '' | ||||
|                       The defType parameter to lucene that defines the parser to | ||||
|                       use. You might want to try "edismax" or look here: | ||||
|                       https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing | ||||
|                     ''; | ||||
|                   }; | ||||
|                   q-op = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.q-op; | ||||
|                     description = "The default combiner for tokens. One of {AND, OR}."; | ||||
|                   }; | ||||
|                 }; | ||||
|               }); | ||||
|               default = defaults.full-text-search.solr; | ||||
|               description = "Configuration for the SOLR backend."; | ||||
|             }; | ||||
|             migration = mkOption { | ||||
|               type = types.submodule({ | ||||
|                 options = { | ||||
|                   index-all-chunk = mkOption { | ||||
|                     type = types.int; | ||||
|                     default = defaults.full-text-search.migration.index-all-chunk; | ||||
|                     description = '' | ||||
|                       Chunk size to use when indexing data from the database. This | ||||
|                       many attachments are loaded into memory and pushed to the | ||||
|                       full-text index. | ||||
|                     ''; | ||||
|                   }; | ||||
|                 }; | ||||
|               }); | ||||
|               default = defaults.full-text-search.migration; | ||||
|               description = "Settings for running the index migration tasks"; | ||||
|             }; | ||||
|           }; | ||||
|         }); | ||||
|         default = defaults.full-text-search; | ||||
|         description = "Configuration for full-text search."; | ||||
|       }; | ||||
|     }; | ||||
|   }; | ||||
|  | ||||
|   | ||||
| @@ -37,6 +37,17 @@ let | ||||
|         header-value = "some-secret"; | ||||
|       }; | ||||
|     }; | ||||
|     full-text-search = { | ||||
|       enabled = false; | ||||
|       solr = { | ||||
|         url = "http://localhost:8983/solr/docspell"; | ||||
|         commit-within = 1000; | ||||
|         log-verbose = false; | ||||
|         def-type = "lucene"; | ||||
|         q-op = "OR"; | ||||
|       }; | ||||
|       recreate-key = ""; | ||||
|     }; | ||||
|     auth = { | ||||
|       server-secret = "hex:caffee"; | ||||
|       session-valid = "5 minutes"; | ||||
| @@ -271,6 +282,75 @@ in { | ||||
|         ''; | ||||
|       }; | ||||
|  | ||||
|       full-text-search = mkOption { | ||||
|         type = types.submodule({ | ||||
|           options = { | ||||
|             enabled = mkOption { | ||||
|               type = types.bool; | ||||
|               default = defaults.full-text-search.enabled; | ||||
|               description = '' | ||||
|                 The full-text search feature can be disabled. It requires an | ||||
|                 additional index server which needs additional memory and disk | ||||
|                 space. It can be enabled later any time. | ||||
|  | ||||
|                 Currently the SOLR search platform is supported. | ||||
|               ''; | ||||
|             }; | ||||
|             solr = mkOption { | ||||
|               type = types.submodule({ | ||||
|                 options = { | ||||
|                   url = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.url; | ||||
|                     description = "The URL to solr"; | ||||
|                   }; | ||||
|                   commit-within = mkOption { | ||||
|                     type = types.int; | ||||
|                     default = defaults.full-text-search.solr.commit-within; | ||||
|                     description = "Used to tell solr when to commit the data"; | ||||
|                   }; | ||||
|                   log-verbose = mkOption { | ||||
|                     type = types.bool; | ||||
|                     default = defaults.full-text-search.solr.log-verbose; | ||||
|                     description = "If true, logs request and response bodies"; | ||||
|                   }; | ||||
|                   def-type = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.def-type; | ||||
|                     description = '' | ||||
|                       The defType parameter to lucene that defines the parser to | ||||
|                       use. You might want to try "edismax" or look here: | ||||
|                       https://lucene.apache.org/solr/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing | ||||
|                     ''; | ||||
|                   }; | ||||
|                   q-op = mkOption { | ||||
|                     type = types.str; | ||||
|                     default = defaults.full-text-search.solr.q-op; | ||||
|                     description = "The default combiner for tokens. One of {AND, OR}."; | ||||
|                   }; | ||||
|                 }; | ||||
|               }); | ||||
|               default = defaults.full-text-search.solr; | ||||
|               description = "Configuration for the SOLR backend."; | ||||
|             }; | ||||
|             recreate-key = mkOption { | ||||
|               type = types.str; | ||||
|               default = defaults.full-text-search.recreate-key; | ||||
|               description = '' | ||||
|                 When re-creating the complete index via a REST call, this key | ||||
|                 is required. If left empty (the default), recreating the index | ||||
|                 is disabled. | ||||
|  | ||||
|                 Example curl command: | ||||
|                 curl -XPOST http://localhost:7880/api/v1/open/fts/reIndexAll/test123 | ||||
|               ''; | ||||
|             }; | ||||
|           }; | ||||
|         }); | ||||
|         default = defaults.full-text-search; | ||||
|         description = "Configuration for full-text search."; | ||||
|       }; | ||||
|  | ||||
|       backend = mkOption { | ||||
|         type = types.submodule({ | ||||
|           options = { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user