|
@@ -1,7 +1,9 @@
|
|
package com.simuwang.daq.components.report.parser.pdf;
|
|
package com.simuwang.daq.components.report.parser.pdf;
|
|
|
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
|
+import cn.hutool.core.collection.ListUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
import cn.hutool.core.util.StrUtil;
|
|
|
|
+import com.simuwang.base.common.conts.Constants;
|
|
import com.simuwang.base.common.exception.APIException;
|
|
import com.simuwang.base.common.exception.APIException;
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
|
|
import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
|
|
@@ -24,6 +26,7 @@ import java.util.Calendar;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
import java.util.regex.Pattern;
|
|
|
|
+import java.util.stream.Collectors;
|
|
|
|
|
|
/**
|
|
/**
|
|
* @author wangzaijun
|
|
* @author wangzaijun
|
|
@@ -31,6 +34,11 @@ import java.util.regex.Pattern;
|
|
* @description pdf格式的报告解析抽象类
|
|
* @description pdf格式的报告解析抽象类
|
|
*/
|
|
*/
|
|
public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
|
|
public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
|
|
|
|
+ /**
|
|
|
|
+ * Text lines extracted from the PDF with watermark text removed and blank lines dropped; assigned while the document is parsed.
|
|
|
|
+ */
|
|
|
|
+ protected List<String> textList;
|
|
|
|
+
|
|
public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
|
|
public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
|
|
super(fieldMappingMapper);
|
|
super(fieldMappingMapper);
|
|
}
|
|
}
|
|
@@ -41,29 +49,35 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
// 解析报告名称和表格
|
|
// 解析报告名称和表格
|
|
String reportName = null;
|
|
String reportName = null;
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
|
|
- CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 1);
|
|
|
|
|
|
+ CustomPDFTextStripper stripper = new CustomPDFTextStripper();
|
|
stripper.setSortByPosition(true);
|
|
stripper.setSortByPosition(true);
|
|
- List<String> textList = StrUtil.split(stripper.getText(document), System.lineSeparator());
|
|
|
|
- textList.removeIf(StrUtil::isBlank);
|
|
|
|
- if (CollUtil.isNotEmpty(textList)) {
|
|
|
|
- reportName = this.matchReportName(textList.get(0));
|
|
|
|
|
|
+ String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, System.lineSeparator());
|
|
|
|
+ this.textList = StrUtil.split(text, System.lineSeparator());
|
|
|
|
+ this.textList.removeIf(StrUtil::isBlank);
|
|
|
|
+ if (CollUtil.isNotEmpty(this.textList)) {
|
|
|
|
+ reportName = this.matchReportName(this.textList.get(0));
|
|
if (StrUtil.isBlank(reportName)) {
|
|
if (StrUtil.isBlank(reportName)) {
|
|
throw new APIException("未匹配到报告名称");
|
|
throw new APIException("未匹配到报告名称");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// 解析所有表格
|
|
// 解析所有表格
|
|
|
|
+ List<Table> tables = ListUtil.list(true);
|
|
SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
|
+ // 自定义表格提取工具,去除单元格中的水印文字
|
|
PageIterator pageIterator = new CustomObjectExtractor(document).extract();
|
|
PageIterator pageIterator = new CustomObjectExtractor(document).extract();
|
|
while (pageIterator.hasNext()) {
|
|
while (pageIterator.hasNext()) {
|
|
Page page = pageIterator.next();
|
|
Page page = pageIterator.next();
|
|
- List<Table> tables = extractionAlgorithm.extract(page);
|
|
|
|
- this.initTableInfo(tables);
|
|
|
|
|
|
+ tables.addAll(extractionAlgorithm.extract(page));
|
|
}
|
|
}
|
|
|
|
+ this.initTableInfo(tables);
|
|
}
|
|
}
|
|
// 解析报告中主体基金的基本信息
|
|
// 解析报告中主体基金的基本信息
|
|
ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
|
|
ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
|
|
// 解析其他表格信息并且设置结果字段
|
|
// 解析其他表格信息并且设置结果字段
|
|
- return this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
|
|
|
|
|
|
+ T reportData = this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
|
|
|
|
+ // 数据清洗后返回
|
|
|
|
+ this.cleaningReportData(reportData);
|
|
|
|
+ return reportData;
|
|
}
|
|
}
|
|
|
|
|
|
protected abstract void initTableInfo(List<Table> tables);
|
|
protected abstract void initTableInfo(List<Table> tables);
|
|
@@ -72,6 +86,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
|
|
|
|
protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
|
|
protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
|
|
|
|
|
|
|
|
+ @Override
|
|
|
|
+ protected void cleaningReportData(T reportData) {
|
|
|
|
+ // Intentionally empty at this level: no cleaning is applied to reportData here (presumably concrete parsers override this - confirm).
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* 构建报告基本信息
|
|
* 构建报告基本信息
|
|
*
|
|
*
|
|
@@ -89,6 +108,33 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
|
|
+ * 匹配分级基金名称
|
|
|
|
+ *
|
|
|
|
+ * @param text 文本内容
|
|
|
|
+ * @return /
|
|
|
|
+ */
|
|
|
|
+ protected List<String> matchTieredFund(String text) {
|
|
|
|
+ List<String> matches = ListUtil.list(false);
|
|
|
|
+ if (StrUtil.isBlank(text)) {
|
|
|
|
+ return matches;
|
|
|
|
+ }
|
|
|
|
+ // 使用正则表达式查找匹配项
|
|
|
|
+ Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
|
|
|
|
+ Matcher matcher = pattern.matcher(text);
|
|
|
|
+ // 收集所有匹配项
|
|
|
|
+ while (matcher.find()) {
|
|
|
|
+ matches.add(matcher.group());
|
|
|
|
+ }
|
|
|
|
+ // 提取字母并按字母顺序排序
|
|
|
|
+ return matches.stream()
|
|
|
|
+ .map(s -> s.replaceAll("[^A-F]", ""))
|
|
|
|
+ .distinct()
|
|
|
|
+ .sorted()
|
|
|
|
+ .map(letter -> letter + "级")
|
|
|
|
+ .collect(Collectors.toList());
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
* 匹配报告名称
|
|
* 匹配报告名称
|
|
*
|
|
*
|
|
* @param text 文本内容
|
|
* @param text 文本内容
|
|
@@ -102,12 +148,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
|
|
Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
|
|
Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
|
|
Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
|
|
Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
|
|
Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
|
|
-
|
|
|
|
// 创建Matcher对象
|
|
// 创建Matcher对象
|
|
Matcher matcher1 = pat1.matcher(text);
|
|
Matcher matcher1 = pat1.matcher(text);
|
|
Matcher matcher2 = pat2.matcher(text);
|
|
Matcher matcher2 = pat2.matcher(text);
|
|
Matcher matcher3 = pat3.matcher(text);
|
|
Matcher matcher3 = pat3.matcher(text);
|
|
-
|
|
|
|
// 尝试匹配
|
|
// 尝试匹配
|
|
String reportName;
|
|
String reportName;
|
|
if (matcher1.find()) {
|
|
if (matcher1.find()) {
|
|
@@ -132,19 +176,16 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
if (string == null) {
|
|
if (string == null) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
-
|
|
|
|
// 编译正则表达式模式
|
|
// 编译正则表达式模式
|
|
Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
|
|
Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度"); // 2023年XXX3季度
|
|
Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
|
|
Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}"); // 2023-12-31
|
|
Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
|
|
Pattern pat3 = Pattern.compile("(2\\d{3})年年度"); // 2023年年度
|
|
Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
|
|
Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月"); // 2023年12月
|
|
-
|
|
|
|
// 创建Matcher对象
|
|
// 创建Matcher对象
|
|
Matcher matcher1 = pat1.matcher(string);
|
|
Matcher matcher1 = pat1.matcher(string);
|
|
Matcher matcher2 = pat2.matcher(string);
|
|
Matcher matcher2 = pat2.matcher(string);
|
|
Matcher matcher3 = pat3.matcher(string);
|
|
Matcher matcher3 = pat3.matcher(string);
|
|
Matcher matcher4 = pat4.matcher(string);
|
|
Matcher matcher4 = pat4.matcher(string);
|
|
-
|
|
|
|
// 尝试匹配
|
|
// 尝试匹配
|
|
if (matcher1.find()) {
|
|
if (matcher1.find()) {
|
|
String year = matcher1.group(1);
|
|
String year = matcher1.group(1);
|
|
@@ -180,13 +221,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
|
|
if (string == null) {
|
|
if (string == null) {
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
-
|
|
|
|
// 编译正则表达式模式
|
|
// 编译正则表达式模式
|
|
Pattern pattern = Pattern.compile("月|季度|年度");
|
|
Pattern pattern = Pattern.compile("月|季度|年度");
|
|
-
|
|
|
|
// 创建Matcher对象
|
|
// 创建Matcher对象
|
|
Matcher matcher = pattern.matcher(string);
|
|
Matcher matcher = pattern.matcher(string);
|
|
-
|
|
|
|
// 尝试匹配
|
|
// 尝试匹配
|
|
if (matcher.find()) {
|
|
if (matcher.find()) {
|
|
return matcher.group();
|
|
return matcher.group();
|