|
@@ -3,16 +3,12 @@ package com.simuwang.daq.components.report.parser.pdf;
|
|
|
import cn.hutool.core.collection.CollUtil;
|
|
|
import cn.hutool.core.collection.ListUtil;
|
|
|
import cn.hutool.core.map.MapUtil;
|
|
|
-import cn.hutool.core.util.StrUtil;
|
|
|
import com.simuwang.base.mapper.EmailFieldMappingMapper;
|
|
|
import com.simuwang.base.pojo.dto.report.*;
|
|
|
import com.simuwang.daq.components.report.parser.ReportParserConstant;
|
|
|
import org.springframework.stereotype.Component;
|
|
|
-import technology.tabula.RectangularTextContainer;
|
|
|
import technology.tabula.Table;
|
|
|
|
|
|
-import java.awt.geom.Rectangle2D;
|
|
|
-import java.util.Comparator;
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
import java.util.function.Function;
|
|
@@ -23,12 +19,20 @@ import java.util.function.Function;
|
|
|
* @description 年报解析逻辑:基本信息被拆分为多个表格,财务报表未解析
|
|
|
*/
|
|
|
@Component(ReportParserConstant.PARSER_PDF_ANNUALLY)
|
|
|
-public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyReportData> {
|
|
|
+public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyReportData> {
|
|
|
+ private static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
|
|
|
+
|
|
|
+ static {
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
|
|
|
+ FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
|
|
|
+ }
|
|
|
+
|
|
|
private final List<Table> fundInfoTables = ListUtil.list(true);
|
|
|
- private final List<Table> shareChangeTables = ListUtil.list(true);
|
|
|
- private final List<Table> assetAllocationTables = ListUtil.list(true);
|
|
|
- private final List<Table> investmentIndustryTables = ListUtil.list(true);
|
|
|
- private final List<Table> financialIndicatorsTables = ListUtil.list(true);
|
|
|
|
|
|
public PDAnnuallyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
|
|
|
super(fieldMappingMapper);
|
|
@@ -49,41 +53,35 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
|
|
|
}
|
|
|
int colCount = table.getColCount();
|
|
|
if (colCount == 2) {
|
|
|
- // 用表格的第二行第一列的数据判断是否份额变动记录
|
|
|
- String text = this.cleaningValue(table.getCell(1, 0).getText());
|
|
|
- if (StrUtil.contains(text, "份额")) {
|
|
|
+ // 用表格的第一列的数据判断是否份额变动记录
|
|
|
+ List<String> texts = this.getTableColTexts(table, 0);
|
|
|
+ if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
|
|
|
this.shareChangeTables.add(table);
|
|
|
}
|
|
|
} else if (colCount == 4) {
|
|
|
- // 用表格的第一行第一列的数据判断是否主要财务指标数据
|
|
|
- String text = this.cleaningValue(table.getCell(0, 0).getText());
|
|
|
- if (StrUtil.contains(text, "期间数据和指标")) {
|
|
|
- this.financialIndicatorsTables.add(table);
|
|
|
- continue;
|
|
|
- }
|
|
|
- // 用表格的第一行第二列的数据判断是否行业配置数据
|
|
|
- text = this.cleaningValue(table.getCell(0, 1).getText());
|
|
|
- if (StrUtil.contains(text, "行业类别")) {
|
|
|
+ // 用表格的第二列的数据判断是否行业配置数据(内地)
|
|
|
+ List<String> texts = this.getTableColTexts(table, 1);
|
|
|
+ if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
|
|
|
this.investmentIndustryTables.add(table);
|
|
|
}
|
|
|
} else if (colCount == 3) {
|
|
|
- // 用表格的第一行第一列的数据判断是否行业配置数据
|
|
|
- String text = this.cleaningValue(table.getCell(0, 0).getText());
|
|
|
- if (StrUtil.contains(text, "行业类别")) {
|
|
|
+ // 用表格的第一列的数据判断是否行业配置数据(港股通)
|
|
|
+ List<String> texts = this.getTableColTexts(table, 0);
|
|
|
+ if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
|
|
|
this.investmentIndustryTables.add(table);
|
|
|
continue;
|
|
|
}
|
|
|
// 资产配置表格识别(兼容跨页的表格)获取表格中第二列的所有文字,判断所有文字中包含"股权投资"等字符串
|
|
|
- List<String> details = ListUtil.list(false);
|
|
|
- for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
|
|
|
- String detail = this.cleaningValue(row.get(1).getText(), false);
|
|
|
- if (StrUtil.isNotBlank(detail)) {
|
|
|
- details.add(detail);
|
|
|
- }
|
|
|
- }
|
|
|
- if (CollUtil.containsAny(details, ListUtil.of("股权投资", "股票投资", "债券投资"))) {
|
|
|
+ texts = this.getTableColTexts(table, 1);
|
|
|
+ if (CollUtil.containsAny(texts, ListUtil.of("股权投资", "股票投资", "债券投资", "另类投资", "其他资产", "其他融资总额"))) {
|
|
|
this.assetAllocationTables.add(table);
|
|
|
}
|
|
|
+ } else {
|
|
|
+ // 用表格的第一列的数据判断是否主要财务指标数据
|
|
|
+ List<String> texts = this.getTableColTexts(table, 0);
|
|
|
+ if (CollUtil.containsAny(texts, ListUtil.of("期间数据和指标", "期末数据和指标", "累计期末指标"))) {
|
|
|
+ this.financialIndicatorsTables.add(table);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -101,46 +99,17 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
|
|
|
}
|
|
|
|
|
|
@Override
|
|
|
- protected Map<String, Object> parseFundInfo(Table fundInfoTable) {
|
|
|
- // 季报和年报的基金基本信息是两列的表格
|
|
|
- Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
|
|
|
- for (int i = 0; i < fundInfoTable.getRows().size(); i++) {
|
|
|
- @SuppressWarnings("all")
|
|
|
- List<RectangularTextContainer> cols = fundInfoTable.getRows().get(i);
|
|
|
- for (int j = 0; j < 1; j++) {
|
|
|
- baseInfoMap.put(cols.get(j).getText(), cols.get(j + 1).getText());
|
|
|
- }
|
|
|
- }
|
|
|
- return baseInfoMap;
|
|
|
- }
|
|
|
-
|
|
|
- @Override
|
|
|
- protected AnnuallyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo) {
|
|
|
- Integer fileId = reportInfo.getFileId();
|
|
|
- // 表格转换数据获取函数
|
|
|
- Function<Table, Map<String, Object>> function = t -> {
|
|
|
- Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
|
|
|
- for (int i = 0; i < t.getRowCount(); i++) {
|
|
|
- String key = t.getCell(i, 0).getText();
|
|
|
- String value = t.getCell(i, 1).getText();
|
|
|
- extInfoMap.put(key, value);
|
|
|
- }
|
|
|
- return extInfoMap;
|
|
|
- };
|
|
|
- // 份额变动
|
|
|
- List<ReportShareChangeDTO> shareChanges = this.buildLevelDto(fileId, this.shareChangeTables,
|
|
|
- ReportShareChangeDTO.class, function);
|
|
|
-// // 主要财务指标
|
|
|
-// List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildLevelDto(fileId, this.financialIndicatorsTables,
|
|
|
-// ReportFinancialIndicatorsDTO.class, function);
|
|
|
- // 资产配置
|
|
|
- List<ReportAssetAllocationDTO> assetAllocations = this.buildAssetAllocationInfo(fileId);
|
|
|
- // 行业配置
|
|
|
- List<ReportInvestmentIndustryDTO> investmentIndustries = this.buildInvestmentIndustryInfo(fileId);
|
|
|
+ protected AnnuallyReportData buildExtData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
|
|
|
+ List<ReportShareChangeDTO> shareChanges,
|
|
|
+ List<ReportAssetAllocationDTO> assetAllocations,
|
|
|
+ List<ReportInvestmentIndustryDTO> investmentIndustries,
|
|
|
+ Function<Table, Map<String, Object>> function) {
|
|
|
+ // 处理财务指标
|
|
|
+ List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildFinancialIndicatorsInfo(reportInfo.getFileId());
|
|
|
// 返回数据构建
|
|
|
AnnuallyReportData reportData = new AnnuallyReportData(reportInfo, fundInfo);
|
|
|
reportData.setShareChange(shareChanges);
|
|
|
- reportData.setFinancialIndicators(null); // todo 财务指标
|
|
|
+ reportData.setFinancialIndicators(financialIndicators);
|
|
|
reportData.setAssetAllocation(assetAllocations);
|
|
|
reportData.setInvestmentIndustry(investmentIndustries);
|
|
|
return reportData;
|
|
@@ -151,84 +120,28 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
|
|
|
// todo 数据清洗
|
|
|
}
|
|
|
|
|
|
- /**
|
|
|
- * 构建基金行业配置解析数据
|
|
|
- *
|
|
|
- * @return /
|
|
|
- */
|
|
|
- private List<ReportInvestmentIndustryDTO> buildInvestmentIndustryInfo(Integer fileId) {
|
|
|
- List<ReportInvestmentIndustryDTO> dtos = ListUtil.list(false);
|
|
|
- for (Table table : this.investmentIndustryTables) {
|
|
|
- int colCount = table.getColCount();
|
|
|
- // 投资地区: 1-境内, 2-港股通
|
|
|
- int investType = colCount == 4 ? 1 : 2;
|
|
|
- int j = colCount == 4 ? 1 : 0;
|
|
|
- // 按行遍历
|
|
|
- for (int i = 0; i < table.getRowCount(); i++) {
|
|
|
- String text = this.cleaningValue(table.getCell(i, 0).getText());
|
|
|
- if (StrUtil.containsAny(text, "序号", "行业类别")) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
|
|
|
- dto.setInvestType(investType);
|
|
|
- dto.setIndustryName(this.cleaningValue(table.getCell(i, j).getText()));
|
|
|
- dto.setMarketValue(this.cleaningValue(table.getCell(i, j + 1).getText()));
|
|
|
- dto.setRatio(this.cleaningValue(table.getCell(i, j + 2).getText()));
|
|
|
- dtos.add(dto);
|
|
|
- }
|
|
|
- }
|
|
|
- return dtos;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 构建基金资产配置解析数据
|
|
|
- *
|
|
|
- * @param fileId 文件id
|
|
|
- * @return /
|
|
|
- */
|
|
|
- private List<ReportAssetAllocationDTO> buildAssetAllocationInfo(Integer fileId) {
|
|
|
- List<ReportAssetAllocationDTO> dtos = ListUtil.list(false);
|
|
|
- String assetType = null;
|
|
|
- for (Table table : this.assetAllocationTables) {
|
|
|
- // 按行遍历
|
|
|
- for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
|
|
|
- // x坐标升序(防止部分行乱序问题)
|
|
|
- row.sort(Comparator.comparing(Rectangle2D.Float::getX));
|
|
|
- // 大类
|
|
|
- String type = this.cleaningValue(row.get(0).getText());
|
|
|
- if (StrUtil.isNotBlank(type)) {
|
|
|
- assetType = type;
|
|
|
- }
|
|
|
- // 金额、市值,有时是 “备注#金额”的格式
|
|
|
- String marketValueAndRemark = this.cleaningValue(row.get(2).getText());
|
|
|
- if (StrUtil.isBlank(marketValueAndRemark) || StrUtil.isBlank(assetType)) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- // 资产明细
|
|
|
- String detail = this.cleaningValue(row.get(1).getText(), false);
|
|
|
- if (StrUtil.contains(marketValueAndRemark, "#")) {
|
|
|
- // 有#表示有备注,而且可能有多个,多个用分号分隔的.
|
|
|
- List<String> marketValueAndRemarks = StrUtil.split(marketValueAndRemark, ";");
|
|
|
- for (String mr : marketValueAndRemarks) {
|
|
|
- if (StrUtil.isBlank(mr)) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- List<String> mrs = StrUtil.split(mr, "#");
|
|
|
- ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
|
|
|
- dto.setAssetType(assetType);
|
|
|
- dto.setAssetDetails(detail);
|
|
|
- dto.setMarketValue(mrs.get(1));
|
|
|
- dto.setRemark(mrs.get(0));
|
|
|
- dtos.add(dto);
|
|
|
+ private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId) {
|
|
|
+ List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
|
|
|
+ // 这里不存在分级基金,可能会存在表格跨页
|
|
|
+ int colCount = this.financialIndicatorsTables.get(0).getColCount();
|
|
|
+ for (int j = 1; j < colCount; j++) {
|
|
|
+ Map<String, Object> infoMap = MapUtil.newHashMap(16);
|
|
|
+ for (Table table : this.financialIndicatorsTables) {
|
|
|
+ String year = this.cleaningValue(table.getCell(0, j).getText());
|
|
|
+ infoMap.put("年度", year);
|
|
|
+ for (int i = 0; i < table.getRowCount(); i++) {
|
|
|
+ String columnName = this.cleaningValue(table.getCell(i, 0).getText());
|
|
|
+ if (!CollUtil.contains(FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
|
|
|
+ continue;
|
|
|
}
|
|
|
- } else {
|
|
|
- ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
|
|
|
- dto.setAssetType(assetType);
|
|
|
- dto.setAssetDetails(detail);
|
|
|
- dto.setMarketValue(marketValueAndRemark);
|
|
|
- dtos.add(dto);
|
|
|
+ String value = this.cleaningValue(table.getCell(i, j).getText());
|
|
|
+ infoMap.put(columnName, value);
|
|
|
}
|
|
|
}
|
|
|
+ ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);
|
|
|
+ this.buildInfo(infoMap, dto);
|
|
|
+ dto.setLevel("母基金");
|
|
|
+ dtos.add(dto);
|
|
|
}
|
|
|
return dtos;
|
|
|
}
|