|
@@ -1,15 +1,13 @@
|
|
|
package com.simuwang.daq.components.report.parser.pdf;
|
|
|
|
|
|
-import cn.hutool.core.collection.CollUtil;
|
|
|
import cn.hutool.core.collection.ListUtil;
|
|
|
+import cn.hutool.core.map.MapUtil;
|
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
import com.simuwang.base.common.conts.Constants;
|
|
|
+import com.simuwang.base.common.enums.ReportType;
|
|
|
import com.simuwang.base.common.exception.APIException;
|
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
|
-import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
|
|
|
-import com.simuwang.base.pojo.dto.report.ReportData;
|
|
|
-import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
|
|
|
-import com.simuwang.base.pojo.dto.report.ReportParserParams;
|
|
|
+import com.simuwang.base.pojo.dto.report.*;
|
|
|
import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
import com.simuwang.daq.components.report.parser.AbstractReportParser;
|
|
|
import org.apache.pdfbox.Loader;
|
|
@@ -24,6 +22,9 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
import java.io.IOException;
|
|
|
import java.util.Calendar;
|
|
|
import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.Objects;
|
|
|
+import java.util.function.Function;
|
|
|
import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
import java.util.stream.Collectors;
|
|
@@ -35,6 +36,10 @@ import java.util.stream.Collectors;
|
|
|
*/
|
|
|
public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
|
|
|
/**
|
|
|
+ * 基金信息表格
|
|
|
+ */
|
|
|
+ protected Table fundInfoTable;
|
|
|
+ /**
|
|
|
* 去除了水印的所有文本内容
|
|
|
*/
|
|
|
protected List<String> textList;
|
|
@@ -45,22 +50,17 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
|
|
|
@Override
|
|
|
public T parse(ReportParserParams params) throws IOException {
|
|
|
+ // 初始化
|
|
|
this.init();
|
|
|
- // 解析报告名称和表格
|
|
|
- String reportName = null;
|
|
|
+ // 解析报告和表格
|
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
|
+ // 识别所有文字(去水印后的)
|
|
|
CustomPDFTextStripper stripper = new CustomPDFTextStripper();
|
|
|
stripper.setSortByPosition(true);
|
|
|
- String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, System.lineSeparator());
|
|
|
+ String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, StrUtil.EMPTY);
|
|
|
this.textList = StrUtil.split(text, System.lineSeparator());
|
|
|
this.textList.removeIf(StrUtil::isBlank);
|
|
|
- if (CollUtil.isNotEmpty(this.textList)) {
|
|
|
- reportName = this.matchReportName(this.textList.get(0));
|
|
|
- if (StrUtil.isBlank(reportName)) {
|
|
|
- throw new APIException("未匹配到报告名称");
|
|
|
- }
|
|
|
- }
|
|
|
- // 解析所有表格
|
|
|
+ // 解析所有表格(单元格字符去水印)
|
|
|
List<Table> tables = ListUtil.list(true);
|
|
|
SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
// 自定义表格提取工具,去除单元格中的水印文字
|
|
@@ -71,20 +71,55 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
}
|
|
|
this.initTableInfo(tables);
|
|
|
}
|
|
|
+ // 报告基本信息
|
|
|
+ ReportBaseInfoDTO reportInfo = this.buildReportInfo(params);
|
|
|
// 解析报告中主体基金的基本信息
|
|
|
- ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
|
|
|
+ ReportFundInfoDTO reportFundInfo = this.buildFundInfo(params);
|
|
|
// 解析其他表格信息并且设置结果字段
|
|
|
- T reportData = this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
|
|
|
+ T reportData = this.parseExtInfoAndSetData(reportInfo, reportFundInfo);
|
|
|
// 数据清洗后返回
|
|
|
this.cleaningReportData(reportData);
|
|
|
return reportData;
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * Initializes parsing of all table data.
|
|
|
+ *
|
|
|
+ * @param tables every table extracted from the document; implementations split them into separate objects according to fixed table patterns
|
|
|
+ */
|
|
|
protected abstract void initTableInfo(List<Table> tables);
|
|
|
|
|
|
- protected abstract ReportFundInfoDTO parseFundInfo(ReportParserParams params);
|
|
|
+ /**
|
|
|
+ * 绑定基金基本信息(年报的基金基本信息解析逻辑要覆盖重写)
|
|
|
+ *
|
|
|
+ * @param params /
|
|
|
+ * @return /
|
|
|
+ */
|
|
|
+ protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
|
|
|
+ Table fundInfoTable = this.fundInfoTable;
|
|
|
+ if (fundInfoTable == null) {
|
|
|
+ throw new APIException("未解析到基本信息表格");
|
|
|
+ }
|
|
|
+ // 基金基本信息映射
|
|
|
+ return this.buildDto(params.getFileId(), fundInfoTable, ReportFundInfoDTO.class, this::parseFundInfo);
|
|
|
+ }
|
|
|
|
|
|
- protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
|
|
|
+ /**
|
|
|
+ * Parses the fund basic-information table.
|
|
|
+ *
|
|
|
+ * @param fundInfoTable the table to parse
|
|
|
+ * @return map of parsed values; buildFundInfo hands this method to buildDto, which populates a ReportFundInfoDTO from the map
|
|
|
+ */
|
|
|
+ protected abstract Map<String, Object> parseFundInfo(Table fundInfoTable);
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Parses the remaining information of the report and sets it on the result object.
|
|
|
+ *
|
|
|
+ * @param reportInfo basic information of the report
|
|
|
+ * @param fundInfo basic information of the fund described in the report
|
|
|
+ * @return the report data object; parse() passes it to cleaningReportData before returning it
|
|
|
+ */
|
|
|
+ protected abstract T parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo);
|
|
|
|
|
|
@Override
|
|
|
protected void cleaningReportData(T reportData) {
|
|
@@ -94,13 +129,13 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
/**
|
|
|
* 构建报告基本信息
|
|
|
*
|
|
|
- * @param fileId 文件id
|
|
|
- * @param reportName 报告名称
|
|
|
+ * @param params /
|
|
|
* @return /
|
|
|
*/
|
|
|
- protected ReportBaseInfoDTO buildReportInfo(Integer fileId, String reportName) {
|
|
|
- ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO();
|
|
|
- reportInfo.setFileId(fileId);
|
|
|
+ private ReportBaseInfoDTO buildReportInfo(ReportParserParams params) {
|
|
|
+ Integer fileId = params.getFileId();
|
|
|
+ String reportName = params.getFilename();
|
|
|
+ ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
|
|
|
reportInfo.setReportName(reportName);
|
|
|
reportInfo.setReportType(this.matchReportType(reportName));
|
|
|
reportInfo.setReportDate(this.matchReportDate(reportName));
|
|
@@ -108,6 +143,56 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
+ * 构建只有两列表格的dto数据对象,如果有分级基金时
|
|
|
+ *
|
|
|
+ * @param <DTO> 泛型对象
|
|
|
+ * @param fileId 文件id
|
|
|
+ * @param tables 表格
|
|
|
+ * @param clazz 泛型对象
|
|
|
+ * @param function 表格转换的函数
|
|
|
+ * @return /
|
|
|
+ */
|
|
|
+ protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<Table> tables, Class<DTO> clazz,
|
|
|
+ Function<Table, Map<String, Object>> function) {
|
|
|
+ // 映射转换
|
|
|
+ List<DTO> dtos = tables.stream().filter(Objects::nonNull)
|
|
|
+ .map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
|
|
|
+ // 分级基金匹配
|
|
|
+ List<String> levels = this.matchTieredFund(String.join(",", this.textList));
|
|
|
+ levels.add(0, "母基金");
|
|
|
+ for (int i = 0; i < dtos.size(); i++) {
|
|
|
+ if (levels.size() <= i) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ dtos.get(i).setLevel(levels.get(i));
|
|
|
+ }
|
|
|
+ return dtos;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 构建只有两列表格的dto数据对象
|
|
|
+ *
|
|
|
+ * @param <DTO> 泛型对象
|
|
|
+ * @param fileId 文件id
|
|
|
+ * @param table 表格
|
|
|
+ * @param clazz 泛型对象
|
|
|
+ * @param function 表格转换的函数
|
|
|
+ * @return /
|
|
|
+ */
|
|
|
+ private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Table table, Class<DTO> clazz,
|
|
|
+ Function<Table, Map<String, Object>> function) {
|
|
|
+ try {
|
|
|
+ Map<String, Object> extInfoMap = function == null ? MapUtil.empty() : function.apply(table);
|
|
|
+ DTO dto = clazz.getDeclaredConstructor().newInstance();
|
|
|
+ dto.setFileId(fileId);
|
|
|
+ this.buildInfo(extInfoMap, dto);
|
|
|
+ return dto;
|
|
|
+ } catch (Exception ignored) {
|
|
|
+ }
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
* 匹配分级基金名称
|
|
|
*
|
|
|
* @param text 文本内容
|
|
@@ -135,38 +220,6 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * 匹配报告名称
|
|
|
- *
|
|
|
- * @param text 文本内容
|
|
|
- * @return /
|
|
|
- */
|
|
|
- private String matchReportName(String text) {
|
|
|
- if (StrUtil.isBlank(text)) {
|
|
|
- return null;
|
|
|
- }
|
|
|
- // 编译正则表达式模式
|
|
|
- Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
|
|
|
- Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
|
|
|
- Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
|
|
|
- // 创建Matcher对象
|
|
|
- Matcher matcher1 = pat1.matcher(text);
|
|
|
- Matcher matcher2 = pat2.matcher(text);
|
|
|
- Matcher matcher3 = pat3.matcher(text);
|
|
|
- // 尝试匹配
|
|
|
- String reportName;
|
|
|
- if (matcher1.find()) {
|
|
|
- reportName = matcher1.group();
|
|
|
- } else if (matcher2.find()) {
|
|
|
- reportName = matcher2.group();
|
|
|
- } else if (matcher3.find()) {
|
|
|
- reportName = matcher3.group();
|
|
|
- } else {
|
|
|
- reportName = text;
|
|
|
- }
|
|
|
- return reportName.replace("(", "(").replace(")", ")").trim();
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
* 匹配报告日期
|
|
|
*
|
|
|
* @param string 文本内容
|
|
@@ -181,11 +234,13 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
|
|
|
Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
|
|
|
Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
|
|
|
+ Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}"); // 20231231
|
|
|
// 创建Matcher对象
|
|
|
Matcher matcher1 = pat1.matcher(string);
|
|
|
Matcher matcher2 = pat2.matcher(string);
|
|
|
Matcher matcher3 = pat3.matcher(string);
|
|
|
Matcher matcher4 = pat4.matcher(string);
|
|
|
+ Matcher matcher5 = pat5.matcher(string);
|
|
|
// 尝试匹配
|
|
|
if (matcher1.find()) {
|
|
|
String year = matcher1.group(1);
|
|
@@ -199,6 +254,8 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
};
|
|
|
} else if (matcher2.find()) {
|
|
|
return matcher2.group();
|
|
|
+ } else if (matcher5.find()) {
|
|
|
+ return matcher5.group();
|
|
|
} else if (matcher3.find()) {
|
|
|
return matcher3.group(1) + "-12-31";
|
|
|
} else if (matcher4.find()) {
|
|
@@ -221,8 +278,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
if (string == null) {
|
|
|
return null;
|
|
|
}
|
|
|
+ // 所有报告的正则识别方式
|
|
|
+ String patterns = ReportType.getAllPatterns();
|
|
|
// 编译正则表达式模式
|
|
|
- Pattern pattern = Pattern.compile("月|季度|年度");
|
|
|
+ Pattern pattern = Pattern.compile(patterns);
|
|
|
// 创建Matcher对象
|
|
|
Matcher matcher = pattern.matcher(string);
|
|
|
// 尝试匹配
|