wangzaijun hai 6 meses
pai
achega
1e4b1e6f21
Modificáronse 22 ficheiros con 1373 adicións e 736 borrados
  1. 3 7
      service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java
  2. 7 1
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java
  3. 81 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/CustomExcelTable.java
  4. 7 1
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java
  5. 0 12
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportInvestmentIndustryDTO.java
  6. 6 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParseStatus.java
  7. 137 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/SimpleTable.java
  8. 89 0
      service-daq/src/main/java/com/simuwang/daq/components/CustomExcelMultiSheetListener.java
  9. 294 0
      service-daq/src/main/java/com/simuwang/daq/components/ReportParseUtils.java
  10. 36 29
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
  11. 12 24
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java
  12. 125 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/AbstractExcelReportParser.java
  13. 243 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelAnnuallyReportParser.java
  14. 79 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelMonthlyReportParser.java
  15. 221 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelQuarterlyReportParser.java
  16. 2 154
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
  17. 11 11
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
  18. 15 127
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
  19. BIN=BIN
      service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf
  20. BIN=BIN
      service-daq/src/main/java/com/simuwang/daq/utils/2061834.pdf
  21. 0 370
      service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java
  22. 5 0
      service-deploy/src/test/java/com/simuwang/ApplicationTest.java

+ 3 - 7
service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java

@@ -1,7 +1,5 @@
 package com.simuwang.base.common.enums;
 
-import cn.hutool.core.util.StrUtil;
-
 import java.util.Arrays;
 
 /**
@@ -11,10 +9,8 @@ import java.util.Arrays;
  */
 public enum ReportParserFileType {
     PDF("pdf"),
-    DOCX("docx"),
-    DOC("doc"),
-    XLSX("xlsx"),
-    XLS("xls"),
+    WORD("docx,doc"),
+    EXCEL("xlsx,xls"),
     PYTHON("python");
 
     private final String suffix;
@@ -25,7 +21,7 @@ public enum ReportParserFileType {
 
     public static ReportParserFileType getBySuffix(String suffix) {
         return Arrays.stream(ReportParserFileType.values())
-                .filter(e -> StrUtil.equals(e.getSuffix(), suffix)).findFirst().orElse(null);
+                // Each constant holds a comma-separated suffix list; require an exact match on one entry.
+                // A plain substring test would also accept "", "x" or "doc,doc" and throw on a null suffix.
+                .filter(e -> Arrays.asList(e.getSuffix().split(",")).contains(suffix)).findFirst().orElse(null);
     }
 
     public String getSuffix() {

+ 7 - 1
service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java

@@ -3,10 +3,13 @@ package com.simuwang.base.pojo.dto.report;
 import cn.hutool.core.date.DatePattern;
 import cn.hutool.core.date.DateUtil;
 import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.pojo.dos.report.BaseReportDO;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.io.Serial;
+import java.io.Serializable;
 import java.math.BigDecimal;
 import java.util.Date;
 
@@ -17,7 +20,10 @@ import java.util.Date;
  */
 @Setter
 @Getter
-public abstract class BaseReportDTO<T extends BaseReportDO> {
+public abstract class BaseReportDTO<T extends BaseReportDO> implements Serializable {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
+
     private Integer fileId;
 
     public BaseReportDTO() {

+ 81 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/CustomExcelTable.java

@@ -0,0 +1,81 @@
+package com.simuwang.base.pojo.dto.report;
+
+import com.simuwang.base.common.conts.Constants;
+
+import java.io.Serial;
+import java.io.Serializable;
+
+/**
+ * Immutable description of one table to look for inside an Excel sheet: how it is identified
+ * (title text), how many columns and rows belong to it, and at which column its data starts.
+ *
+ * @author wangzaijun
+ * @date 2024/10/14 8:52
+ */
+public class CustomExcelTable implements Serializable {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
+    /**
+     * Table identifier used by the code that stores the parsed data; never null.
+     */
+    private final String key;
+    /**
+     * Title text used to recognise the table (compared with equals or contains); never null.
+     * Several titles may be given, separated by commas.
+     */
+    private final String titles;
+    /**
+     * Number of columns that belong to the table; always greater than zero.
+     */
+    private final int colCount;
+    /**
+     * Number of rows that belong to the table (header included, title row excluded).
+     * A non-positive constructor argument is stored as {@link Integer#MAX_VALUE}, i.e. no limit.
+     */
+    private final int rowCount;
+    /**
+     * Column at which the table starts; defaults to 1 (the second column) and is never below 1.
+     */
+    private final int startCol;
+
+    /**
+     * Creates a table description without a row limit that starts at column 1.
+     */
+    public CustomExcelTable(String key, String titles, int colCount) {
+        this(key, titles, colCount, 0, 1);
+    }
+
+    /**
+     * Creates a table description that starts at column 1.
+     */
+    public CustomExcelTable(String key, String titles, int colCount, int rowCount) {
+        this(key, titles, colCount, rowCount, 1);
+    }
+
+    /**
+     * @param key      table identifier, must not be null
+     * @param titles   comma-separated title text, must not be null
+     * @param colCount number of columns, must be greater than zero
+     * @param rowCount number of rows; zero or negative means unlimited
+     * @param startCol first column of the table; values below 1 are raised to 1
+     * @throws NullPointerException      if {@code key} or {@code titles} is null
+     * @throws IndexOutOfBoundsException if {@code colCount} is not positive
+     */
+    public CustomExcelTable(String key, String titles, int colCount, int rowCount, int startCol) {
+        this.key = java.util.Objects.requireNonNull(key, "table key is null.");
+        this.titles = java.util.Objects.requireNonNull(titles, "table titles is null.");
+        if (colCount < 1) {
+            throw new IndexOutOfBoundsException("table col index out of.");
+        }
+        this.colCount = colCount;
+        this.rowCount = rowCount > 0 ? rowCount : Integer.MAX_VALUE;
+        this.startCol = startCol < 1 ? 1 : startCol;
+    }
+
+    public String getKey() {
+        return this.key;
+    }
+
+    public String getTitles() {
+        return this.titles;
+    }
+
+    public int getColCount() {
+        return this.colCount;
+    }
+
+    public int getStartCol() {
+        return this.startCol;
+    }
+
+    public int getRowCount() {
+        return this.rowCount;
+    }
+}

+ 7 - 1
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java

@@ -1,9 +1,13 @@
 package com.simuwang.base.pojo.dto.report;
 
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.common.enums.ReportType;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.io.Serial;
+import java.io.Serializable;
+
 /**
  * @author wangzaijun
  * @date 2024/9/29 9:32
@@ -11,7 +15,9 @@ import lombok.Setter;
  */
 @Setter
 @Getter
-public abstract class ReportData {
+public abstract class ReportData implements Serializable {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
     /**
      * 报告基本信息
      */

+ 0 - 12
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportInvestmentIndustryDTO.java

@@ -13,10 +13,6 @@ import lombok.Setter;
 @Getter
 public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentIndustryDO> {
     /**
-     * 行业分类编码
-     */
-    private String industryCode;
-    /**
      * 行业分类名称
      */
     private String industryName;
@@ -25,10 +21,6 @@ public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentI
      */
     private Integer investType;
     /**
-     * 行业标准编码
-     */
-    private String isbCode;
-    /**
      * 公允价值,市值
      */
     private String marketValue;
@@ -49,10 +41,8 @@ public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentI
     public ReportInvestmentIndustryDO toEntity() {
         ReportInvestmentIndustryDO entity = new ReportInvestmentIndustryDO();
         entity.setFileId(this.getFileId());
-        entity.setIndustryCode(this.industryCode);
         entity.setIndustryName(this.industryName);
         entity.setInvestType(this.investType);
-        entity.setIsbCode(this.isbCode);
         entity.setMarketValue(this.toBigDecimal(this.marketValue));
         entity.setRatio(this.toBigDecimal(this.ratio));
         return entity;
@@ -62,10 +52,8 @@ public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentI
     public String toString() {
         return "{" +
                 super.toString() +
-                ", industryCode='" + industryCode + '\'' +
                 ", industryName='" + industryName + '\'' +
                 ", investType=" + investType +
-                ", isbCode='" + isbCode + '\'' +
                 ", marketValue=" + marketValue +
                 ", ratio=" + ratio +
                 '}';

+ 6 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParseStatus.java

@@ -8,7 +8,13 @@ public enum ReportParseStatus implements StatusCode {
     REPORT_IS_SCAN(21002, "报告[{}] 为扫描件"),
     NO_SUPPORT_TEMPLATE(21003, "报告[{}] 是不支持的文件格式"),
     NOT_A_FIXED_FORMAT(21004, "报告[{}] 不是基协统一格式"),
+
     PARSE_FUND_INFO_FAIL(21010, "报告[{}] 没有解析到基金基本信息"),
+    PARSE_NAV_INFO_FAIL(21011, "报告[{}] 没有解析到基金净值信息"),
+    PARSE_FINANCIAL_INFO_FAIL(21012, "报告[{}] 没有解析到基金财务指标信息"),
+    PARSE_INDUSTRY_INFO_FAIL(21013, "报告[{}] 没有解析到基金行业配置信息"),
+    PARSE_ASSET_INFO_FAIL(21014, "报告[{}] 没有解析到基金资产配置信息"),
+    PARSE_SHARE_INFO_FAIL(21015, "报告[{}] 没有解析到基金份额变动信息"),
     ;
     private final int code;
     private final String msg;

+ 137 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/SimpleTable.java

@@ -0,0 +1,137 @@
+package com.simuwang.base.pojo.dto.report;
+
+import cn.hutool.core.collection.ListUtil;
+import com.simuwang.base.common.conts.Constants;
+
+import java.io.Serial;
+import java.io.Serializable;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * A simple in-memory table, used for the individual tables parsed out of an Excel worksheet.
+ * Rows are appended with {@link #addRow(List)} and can be read as a whole, cell by cell, or
+ * iterated row by row (the class is {@link Iterable}, so it works in a for-each loop).
+ *
+ * @author wangzaijun
+ * @date 2024/10/12 11:00
+ */
+public class SimpleTable implements Serializable, Iterable<List<String>> {
+    @Serial
+    private static final long serialVersionUID = Constants.DEFAULT_SERIAL_ID;
+    /**
+     * Identifier of the table configuration this table was created from.
+     */
+    private final String tableKey;
+    /**
+     * Table title.
+     */
+    private final String title;
+    /**
+     * Data rows of the table.
+     */
+    private final List<List<String>> tables;
+    /**
+     * Number of columns; when constructed with 0 it is derived from the widest row on {@link #addRow(List)}.
+     */
+    private int colCount;
+    /**
+     * Number of rows added through {@link #addRow(List)}.
+     */
+    private int rowCount;
+
+    public SimpleTable(String tableKey, String title, int colCount) {
+        this.tableKey = tableKey;
+        this.title = title;
+        this.colCount = colCount;
+        this.tables = ListUtil.list(true);
+    }
+
+    /**
+     * Appends a row and, while the column count is still 0, sets it to the size of the widest row so far.
+     *
+     * @param row cells of the new row
+     */
+    public void addRow(List<String> row) {
+        tables.add(row);
+        this.rowCount++;
+        if (this.colCount == 0) {
+            this.colCount = this.tables.stream().map(List::size).max(Comparator.naturalOrder()).orElse(0);
+        }
+    }
+
+    public int getColCount() {
+        return colCount;
+    }
+
+    public int getRowCount() {
+        return rowCount;
+    }
+
+    public String getTableKey() {
+        return tableKey;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    /**
+     * Returns the live list of rows (not a copy). If fewer rows are stored than {@code rowCount}
+     * reports, the list is first padded with rows of {@code colCount} null cells.
+     *
+     * @return the rows of this table
+     */
+    public List<List<String>> getTables() {
+        int subRows = this.rowCount - this.tables.size();
+        if (subRows <= 0) {
+            return this.tables;
+        }
+        // rowCount only grows together with the list in addRow, so this padding is a safety net.
+        for (int i = 0; i < subRows; i++) {
+            List<String> row = ListUtil.list(true);
+            for (int j = 0; j < this.colCount; j++) {
+                row.add(null);
+            }
+            this.tables.add(row);
+        }
+        return this.tables;
+    }
+
+    /**
+     * Returns an iterator over the rows of the table, in insertion order.
+     *
+     * @return a row iterator
+     */
+    @Override
+    public Iterator<List<String>> iterator() {
+        return new TableIterator(this.getTables());
+    }
+
+    /**
+     * Returns the content of a single cell.
+     *
+     * @param row    zero-based row index
+     * @param column zero-based column index
+     * @return the cell content (may be null)
+     * @throws IndexOutOfBoundsException if the row or column index is outside the table
+     */
+    public String getCell(int row, int column) {
+        List<List<String>> rows = this.getTables();
+        if (row < 0 || row >= rows.size() || column < 0 || column >= rows.get(row).size()) {
+            throw new IndexOutOfBoundsException("Invalid row or column index");
+        }
+        return rows.get(row).get(column);
+    }
+
+    @Override
+    public String toString() {
+        return "SimpleTable{" +
+                "title='" + title + '\'' +
+                ", tables=" + tables +
+                '}';
+    }
+
+    /**
+     * Index-based row iterator over the table's row list.
+     */
+    private static class TableIterator implements Iterator<List<String>> {
+        private final List<List<String>> tables;
+        private int currentIndex = 0;
+
+        public TableIterator(List<List<String>> tables) {
+            this.tables = tables;
+        }
+
+        @Override
+        public boolean hasNext() {
+            return currentIndex < tables.size();
+        }
+
+        @Override
+        public List<String> next() {
+            // Honour the Iterator contract instead of leaking IndexOutOfBoundsException from the list.
+            if (!hasNext()) {
+                throw new java.util.NoSuchElementException("no more rows");
+            }
+            return tables.get(currentIndex++);
+        }
+    }
+}

+ 89 - 0
service-daq/src/main/java/com/simuwang/daq/components/CustomExcelMultiSheetListener.java

@@ -0,0 +1,89 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.util.StrUtil;
+import com.alibaba.excel.context.AnalysisContext;
+import com.alibaba.excel.event.AnalysisEventListener;
+import com.alibaba.excel.read.metadata.holder.ReadSheetHolder;
+import com.simuwang.base.pojo.dto.report.CustomExcelTable;
+import com.simuwang.base.pojo.dto.report.SimpleTable;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+/**
+ * Excel read listener that splits the rows it receives into {@link SimpleTable}s. Which tables to
+ * look for is described by the {@link CustomExcelTable} list taken from the analysis context's
+ * custom object.
+ *
+ * @author wangzaijun
+ * @date 2024/10/12 9:17
+ */
+public class CustomExcelMultiSheetListener extends AnalysisEventListener<LinkedHashMap<Integer, Object>> implements Closeable {
+    /**
+     * Every table extracted so far, in the order the title rows were met.
+     */
+    private final List<SimpleTable> tables = ListUtil.list(false);
+    /**
+     * Table currently being filled; replaced each time a title row is recognised.
+     */
+    private SimpleTable table;
+    /**
+     * Configuration of the table currently being filled; replaced together with {@link #table}.
+     */
+    private CustomExcelTable customExcelTable;
+
+    public List<SimpleTable> getTables() {
+        return tables;
+    }
+
+    /**
+     * Handles one row: a recognised title row opens a new table, any other row is appended to the
+     * current table while that table is below its configured row limit.
+     */
+    @Override
+    public void invoke(LinkedHashMap<Integer, Object> row, AnalysisContext analysisContext) {
+        ReadSheetHolder holder = analysisContext.readSheetHolder();
+        // Rows of a first sheet whose name contains "封面" are skipped.
+        if (holder.getSheetName().contains("封面") && holder.getSheetNo() == 0) {
+            return;
+        }
+        @SuppressWarnings("unchecked")
+        List<CustomExcelTable> configs = (List<CustomExcelTable>) analysisContext.getCustom();
+        // The cell stored under key 1 is the candidate title of this row.
+        String title = ReportParseUtils.cleaningValue(row.get(1));
+        if (title != null) {
+            CustomExcelTable matched = this.findConfig(title, configs);
+            if (matched != null) {
+                this.table = new SimpleTable(matched.getKey(), title, matched.getColCount());
+                this.customExcelTable = matched;
+                this.tables.add(this.table);
+                return;
+            }
+        }
+        boolean accepting = this.table != null
+                && this.table.getRowCount() < this.customExcelTable.getRowCount();
+        if (accepting) {
+            this.table.addRow(this.readCells(row));
+        }
+    }
+
+    /**
+     * Returns the first configuration whose title text matches the given row title, or null.
+     */
+    private CustomExcelTable findConfig(String title, List<CustomExcelTable> configs) {
+        for (CustomExcelTable config : configs) {
+            String titles = config.getTitles();
+            boolean anyPart = StrUtil.split(titles, ",").stream().anyMatch(title::contains);
+            if (title.equals(titles) || title.contains(titles) || anyPart) {
+                return config;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Copies the cells of the current table's column window out of the raw row.
+     */
+    private List<String> readCells(LinkedHashMap<Integer, Object> row) {
+        int first = this.customExcelTable.getStartCol();
+        int end;
+        if (this.table.getColCount() <= 0) {
+            end = row.size();
+        } else {
+            end = this.table.getColCount() + first;
+        }
+        List<String> cells = ListUtil.list(true);
+        for (int col = first; col < end; col++) {
+            String cell = ReportParseUtils.cleaningValue(row.get(col), false);
+            if (cell != null) {
+                cell = cell.replace(" ", "")
+                        .replace("(", "(").replace(")", ")")
+                        .replace(":", ":").replace(";", ";");
+            }
+            cells.add(cell);
+        }
+        return cells;
+    }
+
+    @Override
+    public void doAfterAllAnalysed(AnalysisContext analysisContext) {
+
+    }
+
+    /**
+     * Discards all extracted tables.
+     */
+    @Override
+    public void close() throws IOException {
+        this.tables.clear();
+    }
+}

+ 294 - 0
service-daq/src/main/java/com/simuwang/daq/components/ReportParseUtils.java

@@ -0,0 +1,294 @@
+package com.simuwang.daq.components;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+
+import java.util.Calendar;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+public final class ReportParseUtils {
+    /**
+     * Row names recognised in the industry-allocation table (filled in the static initializer).
+     */
+    public static final List<String> INDUSTRY_COLUMN_NAMES = ListUtil.list(false);
+    /**
+     * Row names recognised in the share-change table (filled in the static initializer).
+     */
+    public static final List<String> SHARE_CHANGE_COLUMN_NAMES = ListUtil.list(false);
+    /**
+     * Row names recognised in the main financial-indicators table (filled in the static initializer).
+     */
+    public static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
+    /**
+     * Maps an asset-allocation detail item to the asset category it belongs to.
+     */
+    public static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
+
+    static {
+        // 财务指标
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
+
+        // 中国证监会行业标准
+        INDUSTRY_COLUMN_NAMES.add("农、林、牧、渔业");
+        INDUSTRY_COLUMN_NAMES.add("采矿业");
+        INDUSTRY_COLUMN_NAMES.add("制造业");
+        INDUSTRY_COLUMN_NAMES.add("电力、热力、燃气及水生产和供应业");
+        INDUSTRY_COLUMN_NAMES.add("建筑业");
+        INDUSTRY_COLUMN_NAMES.add("批发和零售业");
+        INDUSTRY_COLUMN_NAMES.add("交通运输、仓储和邮政业");
+        INDUSTRY_COLUMN_NAMES.add("住宿和餐饮业");
+        INDUSTRY_COLUMN_NAMES.add("信息传输、软件和信息技术服务业");
+        INDUSTRY_COLUMN_NAMES.add("金融业");
+        INDUSTRY_COLUMN_NAMES.add("房地产业");
+        INDUSTRY_COLUMN_NAMES.add("租赁和商务服务业");
+        INDUSTRY_COLUMN_NAMES.add("科学研究和技术服务业");
+        INDUSTRY_COLUMN_NAMES.add("水利、环境和公共设施管理业");
+        INDUSTRY_COLUMN_NAMES.add("居民服务、修理和其他服务业");
+        INDUSTRY_COLUMN_NAMES.add("教育");
+        INDUSTRY_COLUMN_NAMES.add("卫生和社会工作");
+        INDUSTRY_COLUMN_NAMES.add("文化、体育和娱乐业");
+        INDUSTRY_COLUMN_NAMES.add("综合");
+
+        INDUSTRY_COLUMN_NAMES.add("港股通");
+
+        // 以下为国际标准
+        INDUSTRY_COLUMN_NAMES.add("能源");
+        INDUSTRY_COLUMN_NAMES.add("原材料");
+        INDUSTRY_COLUMN_NAMES.add("工业");
+        INDUSTRY_COLUMN_NAMES.add("非日常生活消费品");
+        INDUSTRY_COLUMN_NAMES.add("日常消费品");
+        INDUSTRY_COLUMN_NAMES.add("医疗保健");
+        INDUSTRY_COLUMN_NAMES.add("金融");
+        INDUSTRY_COLUMN_NAMES.add("信息技术");
+        INDUSTRY_COLUMN_NAMES.add("通讯服务");
+        INDUSTRY_COLUMN_NAMES.add("公用事业");
+        INDUSTRY_COLUMN_NAMES.add("房地产");
+
+        // 份额变动表格识别列
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期初基金份额总额");
+        SHARE_CHANGE_COLUMN_NAMES.add("减:报告期期间基金总赎回份额");
+        SHARE_CHANGE_COLUMN_NAMES.add("期末基金总份额/期末基金实缴总额");
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金拆分变动份额");
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金总申购份额");
+
+        // 资产配置
+        ASSET_ALLOCATION_TYPE_MAPPER.put("银行存款", "现金类资产");
+        // 境内未上市、未挂牌公司股权投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("股权投资", "境内未上市、未挂牌公司股权投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:优先股", "境内未上市、未挂牌公司股权投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其他股权类投资", "境内未上市、未挂牌公司股权投资");
+        // 上市公司定向增发投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("上市公司定向增发股票投资", "上市公司定向增发投资");
+        // 新三板投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("新三板挂牌企业投资", "新三板投资");
+        // 境内证券投资规模
+        ASSET_ALLOCATION_TYPE_MAPPER.put("结算备付金", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("存出保证金", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("股票投资", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("债券投资", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:银行间市场债券", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:利率债", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:信用债", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("资产支持证券", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("基金投资(公募基金)", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:货币基金", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("期货及衍生品交易保证金", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("买入返售金融资产", "境内证券投资规模");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其他证券类标的", "境内证券投资规模");
+        // 资管计划投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("商业银行理财产品投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("信托计划投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("基金公司及其子公司资产管理计划投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("保险资产管理计划投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("证券公司及其子公司资产管理计划投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("期货公司及其子公司资产管理计划投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("私募基金产品投资", "资管计划投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("未在协会备案的合伙企业份额", "资管计划投资");
+        // 另类投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("另类投资", "另类投资");
+        // 境内债权类投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("银行委托贷款规模", "境内债权类投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("信托贷款", "境内债权类投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("应收账款投资", "境内债权类投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("各类受(收)益权投资", "境内债权类投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("票据(承兑汇票等)投资", "境内债权类投资");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其他债权投资", "境内债权类投资");
+        // 境外投资
+        ASSET_ALLOCATION_TYPE_MAPPER.put("境外投资", "境外投资");
+        // 其他资产
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其他资产", "其他资产");
+        // 基金负债情况
+        ASSET_ALLOCATION_TYPE_MAPPER.put("债券回购总额", "基金负债情况");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("融资、融券总额", "基金负债情况");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:融券总额", "基金负债情况");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("银行借款总额", "基金负债情况");
+        ASSET_ALLOCATION_TYPE_MAPPER.put("其他融资总额", "基金负债情况");
+    }
+
+    /**
+     * Cleans a value with parenthesis removal enabled; shorthand for
+     * {@code cleaningValue(value, true)}.
+     *
+     * @param value the value to clean
+     * @return the cleaned text, or null when nothing is left
+     */
+    public static String cleaningValue(Object value) {
+        return cleaningValue(value, true);
+    }
+
+    /**
+     * Performs a light clean-up of a value and returns it as a string.
+     *
+     * @param value              the value to clean
+     * @param replaceParentheses whether parenthesised text (brackets and their content) is removed
+     * @return the cleaned text; null when the result is blank or is exactly "-"
+     */
+    public static String cleaningValue(Object value, boolean replaceParentheses) {
+        String fieldValue = StrUtil.toStringOrNull(value);
+        if (!StrUtil.isNullOrUndefined(fieldValue)) {
+            // Strip carriage returns and spaces.
+            // NOTE(review): replace(";", ";") is a no-op as written — presumably one side was meant
+            // to be a full-width semicolon; verify against the original encoding.
+            fieldValue = fieldValue
+                    .replace("\r", StrUtil.EMPTY)
+                    .replace(";", ";")
+                    .replaceAll(" ", StrUtil.EMPTY);
+            if (replaceParentheses) {
+                // Remove a bracket pair together with its content.
+                // NOTE(review): inside a character class '|' is a literal, so "[(|(]" also matches a
+                // pipe character — presumably full-width brackets were intended; confirm.
+                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
+            }
+        }
+        // A value consisting only of "-" is treated as missing.
+        if (Objects.equals("-", fieldValue)) {
+            fieldValue = null;
+        }
+        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
+    }
+
+    /**
+     * Finds the share-class markers ("A级" … "F级" or "基金A" … "基金F") mentioned in the text and
+     * returns them as "X级" names, de-duplicated and sorted by letter, with "母基金" always first.
+     *
+     * @param text the text to scan; blank text yields a list holding only "母基金"
+     * @return "母基金" followed by the share classes found
+     */
+    public static List<String> matchTieredFund(String text) {
+        if (StrUtil.isBlank(text)) {
+            List<String> parentOnly = ListUtil.list(false);
+            parentOnly.add("母基金");
+            return parentOnly;
+        }
+        // Reduce every hit to its class letter while scanning.
+        List<String> letters = ListUtil.list(false);
+        Matcher hit = Pattern.compile("[A-F]级|基金[A-F]").matcher(text);
+        while (hit.find()) {
+            letters.add(hit.group().replaceAll("[^A-F]", ""));
+        }
+        List<String> result = letters.stream()
+                .distinct()
+                .sorted()
+                .map(letter -> letter + "级")
+                .collect(Collectors.toList());
+        result.add(0, "母基金");
+        return result;
+    }
+
+    /**
+     * Extracts a report date from free text. The patterns below are tried in order and the first
+     * one that matches decides the result:
+     * <ol>
+     *   <li>year plus quarter ("2023年…3季度"): the last day of that quarter</li>
+     *   <li>"yyyy-MM-dd": returned as matched</li>
+     *   <li>eight consecutive digits ("20231231"): returned as matched, without separators</li>
+     *   <li>"2023年年度", then "2023年度": the year followed by "-12-31"</li>
+     *   <li>"2023年12月": the last day of that month</li>
+     * </ol>
+     *
+     * @param string the text to scan
+     * @return the report date, or null when the text is null or nothing matches
+     */
+    public static String matchReportDate(String string) {
+        if (string == null) {
+            return null;
+        }
+        // 2023年XXX3季度
+        Matcher quarterly = Pattern.compile("(2\\d{3}).*([一二三四1234])季度").matcher(string);
+        if (quarterly.find()) {
+            String year = quarterly.group(1);
+            return switch (quarterly.group(2)) {
+                case "一", "1" -> year + "-03-31";
+                case "二", "2" -> year + "-06-30";
+                case "三", "3" -> year + "-09-30";
+                case "四", "4" -> year + "-12-31";
+                default -> null;
+            };
+        }
+        // 2023-12-31
+        Matcher dashed = Pattern.compile("\\d{4}-\\d{2}-\\d{2}").matcher(string);
+        if (dashed.find()) {
+            return dashed.group();
+        }
+        // 20231231
+        Matcher compact = Pattern.compile("\\d{4}\\d{2}\\d{2}").matcher(string);
+        if (compact.find()) {
+            return compact.group();
+        }
+        // 2023年年度
+        Matcher annualLong = Pattern.compile("(2\\d{3})年年度").matcher(string);
+        if (annualLong.find()) {
+            return annualLong.group(1) + "-12-31";
+        }
+        // 2023年度
+        Matcher annualShort = Pattern.compile("(2\\d{3})年度").matcher(string);
+        if (annualShort.find()) {
+            return annualShort.group(1) + "-12-31";
+        }
+        // 2023年12月
+        Matcher monthly = Pattern.compile("(\\d{4})年(\\d{1,2})月").matcher(string);
+        if (monthly.find()) {
+            String year = monthly.group(1);
+            String month = monthly.group(2);
+            int lastDay = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
+            return year + "-" + padZero(month) + "-" + padZero(String.valueOf(lastDay));
+        }
+        return null;
+    }
+
+    /**
+     * Determines the report type keyword contained in a text, e.g. "季度" or "年度".
+     * <p>
+     * Keywords are checked in priority order (annual, quarterly, monthly, then a bare "年"), and the
+     * first one contained anywhere in the text is returned. A regex alternation is not used because
+     * it returns the left-most match: for a name such as "2023年3季度报告" it would report "年"
+     * instead of "季度".
+     *
+     * @param string the text to scan, typically a report file name
+     * @return the matched keyword, or null when the text is null or contains none of them
+     */
+    public static String matchReportType(String string) {
+        if (string == null) {
+            return null;
+        }
+        // Same keywords, in the same priority order, as the former alternation pattern.
+        String[] keywords = {"年度", "年报", "季度", "季报", "季", "月度", "月报", "月", "年"};
+        for (String keyword : keywords) {
+            if (string.contains(keyword)) {
+                return keyword;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Returns the number of days in the given month of the ISO calendar.
+     * <p>
+     * A {@code Calendar} seeded with "now" keeps today's day-of-month, so on the 29th to 31st a
+     * shorter target month rolls over into the following month and the wrong length is returned.
+     * {@code YearMonth} has no day component and does not depend on the default locale's calendar.
+     *
+     * @param year  the year
+     * @param month the month, 1 = January; values outside 1..12 roll over into adjacent years
+     * @return the last day of that month (28 to 31)
+     */
+    private static int getLastDayOfMonth(int year, int month) {
+        // Start from January and shift, so out-of-range months keep rolling over instead of throwing.
+        return java.time.YearMonth.of(year, 1).plusMonths(month - 1L).lengthOfMonth();
+    }
+
+    /**
+     * Formats a decimal number string with at least two digits, left-padded with zeros.
+     *
+     * @param number text holding an integer
+     * @return the zero-padded number, e.g. "3" becomes "03"
+     * @throws NumberFormatException if the text is not a parsable integer
+     */
+    private static String padZero(String number) {
+        int parsed = Integer.parseInt(number);
+        return String.format("%02d", parsed);
+    }
+}

+ 36 - 29
service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java

@@ -6,14 +6,16 @@ import cn.hutool.core.util.ReflectUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
+import com.simuwang.base.pojo.dto.report.BaseReportDTO;
+import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
 import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+import com.simuwang.daq.components.ReportParseUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
-import java.util.regex.Pattern;
 
 /**
  * @author wangzaijun
@@ -60,6 +62,26 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
     protected abstract void cleaningReportData(T reportData);
 
     /**
+     * Creates a DTO of the given class through its no-argument constructor, sets the file id and
+     * fills the remaining fields from the name/value map via {@code buildInfo}.
+     *
+     * @param <DTO>   concrete DTO type
+     * @param fileId  id of the file the data was parsed from
+     * @param clazz   DTO class to instantiate; needs a no-argument constructor
+     * @param infoMap name/value pairs taken from the two-column table
+     * @return the populated DTO, or null when instantiation or population throws
+     */
+    protected <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Class<DTO> clazz, Map<String, Object> infoMap) {
+        try {
+            DTO dto = clazz.getDeclaredConstructor().newInstance();
+            dto.setFileId(fileId);
+            this.buildInfo(infoMap, dto);
+            return dto;
+        } catch (Exception ignored) {
+            // NOTE(review): every failure is swallowed here and surfaces only as a null result;
+            // consider logging the exception so that a broken DTO class can be diagnosed.
+        }
+        return null;
+    }
+
+    /**
      * 对象字段设置
      *
      * @param extInfoMap 名称与值的对应关系
@@ -70,8 +92,8 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
             return;
         }
         for (Map.Entry<String, Object> entry : extInfoMap.entrySet()) {
-            String k = this.cleaningValue(entry.getKey());
-            String fieldValue = this.cleaningValue(entry.getValue());
+            String k = ReportParseUtils.cleaningValue(entry.getKey());
+            String fieldValue = ReportParseUtils.cleaningValue(entry.getValue());
             String fieldName = this.fieldMapper.get(k);
             if (StrUtil.isBlank(fieldName)) {
                 continue;
@@ -84,34 +106,19 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
         }
     }
 
-    protected String cleaningValue(Object value) {
-        return this.cleaningValue(value, true);
-    }
-
     /**
-     * 数据简单清洗,并全部转为字符串类型
+     * 构建报告基本信息
      *
-     * @param value              待清洗的数据
-     * @param replaceParentheses 是否替换圆括号
+     * @param params /
      * @return /
      */
-    protected String cleaningValue(Object value, boolean replaceParentheses) {
-        String fieldValue = StrUtil.toStringOrNull(value);
-        if (!StrUtil.isNullOrUndefined(fieldValue)) {
-            // 特殊字符替换,空格替换为空字符
-            fieldValue = fieldValue
-                    .replace("\r", StrUtil.EMPTY)
-                    .replace(";", ";")
-                    .replaceAll(" ", StrUtil.EMPTY);
-            if (replaceParentheses) {
-                // 正则表达式匹配中文括号及其内容,并替换为空字符串
-                fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);
-            }
-        }
-        // 如果仅有 “-” 该字段值为null
-        if (Objects.equals("-", fieldValue)) {
-            fieldValue = null;
-        }
-        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
+    protected ReportBaseInfoDTO buildReportInfo(ReportParserParams params) {
+        // Report type and report date are both derived from the file name.
+        Integer id = params.getFileId();
+        String filename = params.getFilename();
+        String type = ReportParseUtils.matchReportType(filename);
+        String date = ReportParseUtils.matchReportDate(filename);
+        ReportBaseInfoDTO info = new ReportBaseInfoDTO(id);
+        info.setReportName(filename);
+        info.setReportType(type);
+        info.setReportDate(date);
+        return info;
+    }
 }

+ 12 - 24
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java

@@ -15,53 +15,41 @@ public final class ReportParserConstant {
     public static final Map<ReportType, Map<ReportParserFileType, String>> REPORT_PARSER_BEAN_MAP = MapUtil.newHashMap(8);
 
     public static final String PARSER_PDF_MONTHLY = "report-parser:pdf:monthly";
-    public static final String PARSER_DOC_MONTHLY = "report-parser:doc:monthly";
-    public static final String PARSER_DOCX_MONTHLY = "report-parser:docx:monthly";
-    public static final String PARSER_XLSX_MONTHLY = "report-parser:xlsx:monthly";
-    public static final String PARSER_XLS_MONTHLY = "report-parser:xls:monthly";
+    public static final String PARSER_WORD_MONTHLY = "report-parser:word:monthly";
+    public static final String PARSER_EXCEL_MONTHLY = "report-parser:excel:monthly";
     public static final String PARSER_PYTHON_MONTHLY = "report-parser:python:monthly";
 
     public static final String PARSER_PDF_QUARTERLY = "report-parser:pdf:quarterly";
-    public static final String PARSER_DOC_QUARTERLY = "report-parser:doc:quarterly";
-    public static final String PARSER_DOCX_QUARTERLY = "report-parser:docx:quarterly";
-    public static final String PARSER_XLSX_QUARTERLY = "report-parser:xlsx:quarterly";
-    public static final String PARSER_XLS_QUARTERLY = "report-parser:xls:quarterly";
+    public static final String PARSER_WORD_QUARTERLY = "report-parser:word:quarterly";
+    public static final String PARSER_EXCEL_QUARTERLY = "report-parser:excel:quarterly";
     public static final String PARSER_PYTHON_QUARTERLY = "report-parser:python:quarterly";
 
     public static final String PARSER_PDF_ANNUALLY = "report-parser:pdf:annually";
-    public static final String PARSER_DOC_ANNUALLY = "report-parser:doc:annually";
-    public static final String PARSER_DOCX_ANNUALLY = "report-parser:docx:annually";
-    public static final String PARSER_XLSX_ANNUALLY = "report-parser:xlsx:annually";
-    public static final String PARSER_XLS_ANNUALLY = "report-parser:xls:annually";
+    public static final String PARSER_WORD_ANNUALLY = "report-parser:word:annually";
+    public static final String PARSER_EXCEL_ANNUALLY = "report-parser:excel:annually";
     public static final String PARSER_PYTHON_ANNUALLY = "report-parser:python:annually";
 
     static {
         REPORT_PARSER_BEAN_MAP.put(ReportType.MONTHLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
-                        ReportParserFileType.DOC, PARSER_DOC_MONTHLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_MONTHLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_MONTHLY,
-                        ReportParserFileType.XLS, PARSER_XLS_MONTHLY,
+                        ReportParserFileType.WORD, PARSER_WORD_MONTHLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_MONTHLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_MONTHLY
                 ));
 
         REPORT_PARSER_BEAN_MAP.put(ReportType.QUARTERLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_QUARTERLY,
-                        ReportParserFileType.DOC, PARSER_DOC_QUARTERLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_QUARTERLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_QUARTERLY,
-                        ReportParserFileType.XLS, PARSER_XLS_QUARTERLY,
+                        ReportParserFileType.WORD, PARSER_WORD_QUARTERLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_QUARTERLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_QUARTERLY
                 ));
 
         REPORT_PARSER_BEAN_MAP.put(ReportType.ANNUALLY,
                 Map.of(ReportParserFileType.PDF, PARSER_PDF_ANNUALLY,
-                        ReportParserFileType.DOC, PARSER_DOC_ANNUALLY,
-                        ReportParserFileType.DOCX, PARSER_DOCX_ANNUALLY,
-                        ReportParserFileType.XLSX, PARSER_XLSX_ANNUALLY,
-                        ReportParserFileType.XLS, PARSER_XLS_ANNUALLY,
+                        ReportParserFileType.WORD, PARSER_WORD_ANNUALLY,
+                        ReportParserFileType.EXCEL, PARSER_EXCEL_ANNUALLY,
 
                         ReportParserFileType.PYTHON, PARSER_PYTHON_ANNUALLY
                 ));

+ 125 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/AbstractExcelReportParser.java

@@ -0,0 +1,125 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.exceptions.ExceptionUtil;
+import cn.hutool.core.util.StrUtil;
+import com.alibaba.excel.EasyExcel;
+import com.alibaba.excel.read.builder.ExcelReaderBuilder;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.CustomExcelMultiSheetListener;
+import com.simuwang.daq.components.ReportParseUtils;
+import com.simuwang.daq.components.report.parser.AbstractReportParser;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for report parsers that read Excel workbooks.
+ * <p>
+ * {@link #parse(ReportParserParams)} reads every sheet through a {@link CustomExcelMultiSheetListener},
+ * then delegates the interpretation of the extracted {@link SimpleTable}s to the subclass.
+ *
+ * @param <T> concrete report data type produced by the parser
+ */
+public abstract class AbstractExcelReportParser<T extends ReportData> extends AbstractReportParser<T> {
+    public AbstractExcelReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * Reads the workbook referenced by {@code params} and converts it into report data.
+     *
+     * @param params file id, file name and file path of the report to parse
+     * @return the parsed report data
+     * @throws ReportParseException with {@link ReportParseStatus#NOT_A_FIXED_FORMAT} when reading the workbook
+     *                              or interpreting its tables fails unexpectedly; exceptions of this type
+     *                              raised by subclasses are propagated unchanged
+     */
+    @Override
+    public T parse(ReportParserParams params) throws IOException, ReportParseException {
+        String filename = params.getFilename();
+        this.init();
+        // Listener that collects the tables found while the sheets are read.
+        CustomExcelMultiSheetListener excelListener = new CustomExcelMultiSheetListener();
+        try {
+            ExcelReaderBuilder readerBuilder = EasyExcel.read(params.getFilepath());
+            // No sheet() call here: doReadAll() creates its own reader covering all sheets, whereas
+            // sheet() would build an additional reader that is never read or closed.
+            // The sheets have no header row.
+            readerBuilder.headRowNumber(0);
+            // Custom object made available to the read listener.
+            readerBuilder.customObject(this.customExcelTables());
+            readerBuilder.registerReadListener(excelListener);
+            readerBuilder.doReadAll();
+        } catch (Exception e) {
+            this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
+            throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
+        }
+        try {
+            // Tables extracted from the workbook.
+            List<SimpleTable> tables = excelListener.getTables();
+            // Basic information about the report itself.
+            ReportBaseInfoDTO reportInfo = this.buildReportInfo(params);
+            // Basic information about the fund the report is about.
+            ReportFundInfoDTO reportFundInfo = this.buildFundInfo(params, tables);
+            // Remaining tables, assembled into the result object.
+            T reportData = this.parseExtInfoAndSetData(reportInfo, reportFundInfo, tables);
+            // Clean the data before returning it.
+            this.cleaningReportData(reportData);
+            return reportData;
+        } catch (ReportParseException e) {
+            // Already carries a specific parse status; do not downgrade it.
+            throw e;
+        } catch (Exception e) {
+            this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
+            throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT, filename);
+        }
+    }
+
+    /**
+     * Default cleaning step: does nothing. Subclasses may override it.
+     *
+     * @param reportData parsed report data
+     */
+    @Override
+    protected void cleaningReportData(T reportData) {
+        // cleaning.
+    }
+
+    /**
+     * Table parsing configuration; every subclass supplies its own.
+     *
+     * @return the table definitions passed to the read listener as its custom object
+     */
+    protected abstract List<CustomExcelTable> customExcelTables();
+
+    /**
+     * Parses the remaining data and stores it in the report result object.
+     *
+     * @param reportInfo     basic report information
+     * @param reportFundInfo basic information of the reported fund
+     * @param tables         all tables extracted from the workbook
+     * @return the assembled report data
+     */
+    protected abstract T parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO reportFundInfo, List<SimpleTable> tables);
+
+    /**
+     * Parses the table(s) holding the basic fund information.
+     *
+     * @param params parser parameters (file id, file name)
+     * @param tables all tables extracted from the workbook
+     * @return the basic fund information
+     */
+    protected abstract ReportFundInfoDTO buildFundInfo(ReportParserParams params, List<SimpleTable> tables);
+
+    /**
+     * Builds one DTO per fund level (the fund itself plus tiered funds; tiered-fund tables may be absent).
+     * <p>
+     * The levels are derived from the comma-joined table titles via
+     * {@link ReportParseUtils#matchTieredFund}; the i-th level is paired with the i-th table.
+     *
+     * @param fileId   file id
+     * @param tables   tables of one kind, in extraction order
+     * @param clazz    DTO class to instantiate
+     * @param function turns a table into the name/value map used to populate the DTO
+     * @param <DTO>    DTO type carrying a level
+     * @return the DTOs that could be built; levels that are blank, have no table, or yield no DTO are skipped
+     */
+    protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<SimpleTable> tables, Class<DTO> clazz,
+                                                                          Function<SimpleTable, Map<String, Object>> function) {
+        String titles = tables.stream().map(SimpleTable::getTitle).collect(Collectors.joining(","));
+        List<String> levels = ReportParseUtils.matchTieredFund(titles);
+        List<DTO> dtos = ListUtil.list(true);
+        for (int i = 0; i < levels.size(); i++) {
+            String level = levels.get(i);
+            if (i >= tables.size() || StrUtil.isBlank(level)) {
+                continue;
+            }
+            Map<String, Object> infoMap = function.apply(tables.get(i));
+            DTO dto = this.buildDto(fileId, clazz, infoMap);
+            if (dto == null) {
+                continue;
+            }
+            dto.setLevel(level);
+            dtos.add(dto);
+        }
+        return dtos;
+    }
+}

+ 243 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelAnnuallyReportParser.java

@@ -0,0 +1,243 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Parser for annual fund reports delivered as Excel workbooks.
+ */
+@Component(ReportParserConstant.PARSER_EXCEL_ANNUALLY)
+public class ExcelAnnuallyReportParser extends AbstractExcelReportParser<AnnuallyReportData> {
+    public ExcelAnnuallyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_EXCEL_ANNUALLY;
+    }
+
+    /**
+     * Table definitions for the annual report. Each entry carries the table key used later to pick the
+     * extracted tables, the title text to look for, and numeric layout hints (see {@link CustomExcelTable}).
+     *
+     * @return the table definitions
+     */
+    @Override
+    protected List<CustomExcelTable> customExcelTables() {
+        List<CustomExcelTable> customExcelTables = ListUtil.list(true);
+        customExcelTables.add(new CustomExcelTable("fundInfo", "基金基本情况", 2));
+        customExcelTables.add(new CustomExcelTable("fundInfo", "基金产品说明", 2, 4));
+        customExcelTables.add(new CustomExcelTable("financialIndicators", "主要会计数据和财务指标", 4));
+        customExcelTables.add(new CustomExcelTable("financialIndicators", "级基金主要会计数据和财务指标", 4, 10));
+        customExcelTables.add(new CustomExcelTable("assetAllocation", "期末基金资产组合情况", 3));
+        customExcelTables.add(new CustomExcelTable("investmentIndustry", "报告期末按行业分类的境内股票投资组合", 4));
+        customExcelTables.add(new CustomExcelTable("investmentIndustry", "报告期末按行业分类的港股通投资股票投资组合", 3));
+        customExcelTables.add(new CustomExcelTable("shareChange", "基金份额变动情况", 2, 6));
+        customExcelTables.add(new CustomExcelTable("shareChange", "级基金份额变动情况", 2, 6));
+        return customExcelTables;
+    }
+
+    /**
+     * Builds the financial indicators, asset allocation, industry allocation and share change sections
+     * and assembles them into the annual report data.
+     *
+     * @param reportInfo     basic report information
+     * @param reportFundInfo basic fund information
+     * @param tables         all tables extracted from the workbook
+     * @return the assembled annual report data
+     */
+    @Override
+    protected AnnuallyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO reportFundInfo, List<SimpleTable> tables) {
+        Integer fileId = reportInfo.getFileId();
+        String reportName = reportInfo.getReportName();
+        // Main financial indicators
+        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildFinancialIndicatorsInfo(fileId, reportName, tables);
+        // Asset allocation
+        List<ReportAssetAllocationDTO> assetAllocations = this.buildAssetAllocationInfo(fileId, reportName, tables);
+        // Industry allocation
+        List<ReportInvestmentIndustryDTO> investmentIndustries = this.buildInvestmentIndustryInfo(fileId, reportName, tables);
+        // Share changes
+        List<ReportShareChangeDTO> shareChanges = this.buildShareChangeInfo(fileId, reportName, tables);
+        // Assemble the result
+        AnnuallyReportData reportData = new AnnuallyReportData(reportInfo, reportFundInfo);
+        reportData.setFinancialIndicators(financialIndicators);
+        reportData.setAssetAllocation(assetAllocations);
+        reportData.setInvestmentIndustry(investmentIndustries);
+        reportData.setShareChange(shareChanges);
+        return reportData;
+    }
+
+    /**
+     * Builds the basic fund information from all tables keyed {@code fundInfo}. Every row contributes
+     * its first cell as the field name and its second cell as the value; later tables override
+     * earlier ones on duplicate names.
+     *
+     * @param params parser parameters (file id, file name)
+     * @param tables all tables extracted from the workbook
+     * @return the basic fund information
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_FUND_INFO_FAIL} when no such table exists
+     */
+    @Override
+    protected ReportFundInfoDTO buildFundInfo(ReportParserParams params, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "fundInfo".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL, params.getFilename());
+        }
+        Map<String, Object> fundInfoMap = MapUtil.newHashMap(32);
+        for (SimpleTable table : simpleTables) {
+            for (List<String> cols : table.getTables()) {
+                // A row without a value cell cannot contribute a name/value pair; skip it
+                // instead of failing the whole report with an index error.
+                if (cols == null || cols.size() < 2) {
+                    continue;
+                }
+                fundInfoMap.put(cols.get(0), cols.get(1));
+            }
+        }
+        ReportFundInfoDTO info = new ReportFundInfoDTO(params.getFileId());
+        this.buildInfo(fundInfoMap, info);
+        return info;
+    }
+
+    /**
+     * Builds the fund share change information from the tables keyed {@code shareChange}.
+     *
+     * @param fileId   file id
+     * @param filename file name, used for error reporting
+     * @param tables   all tables
+     * @return one DTO per fund level
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_SHARE_INFO_FAIL} when no such table exists
+     */
+    private List<ReportShareChangeDTO> buildShareChangeInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "shareChange".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_SHARE_INFO_FAIL, filename);
+        }
+        Function<SimpleTable, Map<String, Object>> function = t -> {
+            Map<String, Object> infoMap = MapUtil.newHashMap(16);
+            for (List<String> row : t.getTables()) {
+                // Rows lacking a value cell are skipped rather than raising an index error.
+                if (row == null || row.size() < 2) {
+                    continue;
+                }
+                String name = row.get(0);
+                if (name == null || !ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES.contains(name)) {
+                    continue;
+                }
+                infoMap.put(name, row.get(1));
+            }
+            return infoMap;
+        };
+        return this.buildLevelDto(fileId, simpleTables, ReportShareChangeDTO.class, function);
+    }
+
+    /**
+     * Builds the main financial indicators from the tables keyed {@code financialIndicators}.
+     * <p>
+     * Each table is read column by column: row 0 of a column holds the year, column 0 of a row holds
+     * the indicator name. One DTO is produced per value column and fund level.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return the financial indicator DTOs
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_FINANCIAL_INFO_FAIL} when no such table exists
+     */
+    private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "financialIndicators".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FINANCIAL_INFO_FAIL, filename);
+        }
+        List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
+        // Fund levels (tiered funds), matched from the table titles
+        String titles = simpleTables.stream().map(SimpleTable::getTitle).collect(Collectors.joining(","));
+        List<String> levels = ReportParseUtils.matchTieredFund(titles);
+        for (int k = 0; k < levels.size(); k++) {
+            String level = levels.get(k);
+            if (k >= simpleTables.size() || StrUtil.isBlank(level)) {
+                continue;
+            }
+            SimpleTable table = simpleTables.get(k);
+            int colCount = table.getColCount();
+            // Column 0 holds the indicator names, so value columns start at 1.
+            for (int j = 1; j < colCount; j++) {
+                Map<String, Object> infoMap = MapUtil.newHashMap(16);
+                String year = ReportParseUtils.cleaningValue(table.getCell(0, j));
+                infoMap.put("年度", year);
+                for (int i = 0; i < table.getRowCount(); i++) {
+                    String columnName = ReportParseUtils.cleaningValue(table.getCell(i, 0));
+                    if (!CollUtil.contains(ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
+                        continue;
+                    }
+                    String value = ReportParseUtils.cleaningValue(table.getCell(i, j));
+                    infoMap.put(columnName, value);
+                }
+                ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);
+                this.buildInfo(infoMap, dto);
+                dto.setLevel(level);
+                dtos.add(dto);
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Builds the industry allocation information from the tables keyed {@code investmentIndustry}.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return one DTO per recognised industry row
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_INDUSTRY_INFO_FAIL} when no such table exists
+     */
+    private List<ReportInvestmentIndustryDTO> buildInvestmentIndustryInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "investmentIndustry".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_INDUSTRY_INFO_FAIL, filename);
+        }
+        List<ReportInvestmentIndustryDTO> dtos = ListUtil.list(false);
+        for (SimpleTable table : simpleTables) {
+            int colCount = table.getColCount();
+            // Investment region: 1 = domestic, 2 = Hong Kong Stock Connect.
+            // The four-column table is treated as domestic, with the industry name in column 1;
+            // any other table is treated as Hong Kong Stock Connect, with the name in column 0.
+            int investType = colCount == 4 ? 1 : 2;
+            int j = colCount == 4 ? 1 : 0;
+            // Walk the table row by row
+            for (int i = 0; i < table.getRowCount(); i++) {
+                String industryName = ReportParseUtils.cleaningValue(table.getCell(i, j));
+                if (StrUtil.isBlank(industryName) || !ReportParseUtils.INDUSTRY_COLUMN_NAMES.contains(industryName)) {
+                    continue;
+                }
+                ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
+                dto.setInvestType(investType);
+                dto.setIndustryName(industryName);
+                dto.setMarketValue(ReportParseUtils.cleaningValue(table.getCell(i, j + 1)));
+                dto.setRatio(ReportParseUtils.cleaningValue(table.getCell(i, j + 2)));
+                dtos.add(dto);
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Builds the asset allocation information from the first table keyed {@code assetAllocation}.
+     * <p>
+     * Column 1 of a row holds the asset detail name and column 2 the market value. A value containing
+     * {@code #} carries remarks in the form {@code remark#value}; several such pairs may be separated
+     * by semicolons and each produces its own DTO.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return the asset allocation DTOs
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_ASSET_INFO_FAIL} when no such table exists
+     */
+    private List<ReportAssetAllocationDTO> buildAssetAllocationInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        SimpleTable assetAllocationTable = tables.stream().filter(e -> "assetAllocation".equals(e.getTableKey())).findFirst().orElse(null);
+        if (assetAllocationTable == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_ASSET_INFO_FAIL, filename);
+        }
+        List<ReportAssetAllocationDTO> dtos = ListUtil.list(false);
+        // Walk the table row by row
+        for (List<String> row : assetAllocationTable.getTables()) {
+            // Rows without a value column cannot be interpreted; skip them instead of failing.
+            if (row == null || row.size() < 3) {
+                continue;
+            }
+            String detail = row.get(1);
+            if (detail == null || !ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {
+                continue;
+            }
+            String assetType = ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.get(detail);
+            String marketValueAndRemark = row.get(2);
+            if (!StrUtil.contains(marketValueAndRemark, "#")) {
+                dtos.add(this.newAssetAllocation(fileId, assetType, detail, marketValueAndRemark, null));
+                continue;
+            }
+            // A '#' means remarks are present; there may be several pairs separated by semicolons.
+            for (String mr : StrUtil.split(marketValueAndRemark, ";")) {
+                if (StrUtil.isBlank(mr)) {
+                    continue;
+                }
+                List<String> mrs = StrUtil.split(mr, "#");
+                if (mrs.size() < 2) {
+                    // Segment without a remark: the whole segment is the market value.
+                    dtos.add(this.newAssetAllocation(fileId, assetType, detail, mr, null));
+                } else {
+                    dtos.add(this.newAssetAllocation(fileId, assetType, detail, mrs.get(1), mrs.get(0)));
+                }
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Creates a single asset allocation DTO; the remark is only set when one is given.
+     */
+    private ReportAssetAllocationDTO newAssetAllocation(Integer fileId, String assetType, String detail,
+                                                        String marketValue, String remark) {
+        ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
+        dto.setAssetType(assetType);
+        dto.setAssetDetails(detail);
+        dto.setMarketValue(marketValue);
+        if (remark != null) {
+            dto.setRemark(remark);
+        }
+        return dto;
+    }
+}

+ 79 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelMonthlyReportParser.java

@@ -0,0 +1,79 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Parser for monthly fund reports delivered as Excel workbooks.
+ */
+@Component(ReportParserConstant.PARSER_EXCEL_MONTHLY)
+public class ExcelMonthlyReportParser extends AbstractExcelReportParser<MonthlyReportData> {
+    public ExcelMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_EXCEL_MONTHLY;
+    }
+
+    /**
+     * Table definitions for the monthly report. Each entry carries the table key used later to pick the
+     * extracted tables, the title text to look for, and numeric layout hints (see {@link CustomExcelTable}).
+     *
+     * @return the table definitions
+     */
+    @Override
+    protected List<CustomExcelTable> customExcelTables() {
+        List<CustomExcelTable> customExcelTables = ListUtil.list(true);
+        customExcelTables.add(new CustomExcelTable("fundInfo", "基金概况", 4));
+        customExcelTables.add(new CustomExcelTable("netReport", "净值月报", 5, 2));
+        customExcelTables.add(new CustomExcelTable("netReport", "级基金净值表", 5, 2));
+        return customExcelTables;
+    }
+
+    /**
+     * Builds the net value section for the parent fund and its tiered funds and assembles the
+     * monthly report data.
+     *
+     * @param reportInfo     basic report information
+     * @param reportFundInfo basic fund information
+     * @param tables         all tables extracted from the workbook
+     * @return the assembled monthly report data
+     */
+    @Override
+    protected MonthlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO reportFundInfo, List<SimpleTable> tables) {
+        MonthlyReportData reportData = new MonthlyReportData(reportInfo, reportFundInfo);
+        // Net value tables of the parent fund and the tiered funds
+        List<SimpleTable> netNavTables = tables.stream().filter(e -> "netReport".equals(e.getTableKey())).collect(Collectors.toList());
+        // Net values of the parent fund and the tiered funds:
+        // row 0 of each table holds the field names, row 1 the matching values.
+        List<ReportNetReportDTO> dtos = this.buildLevelDto(reportInfo.getFileId(), netNavTables,
+                ReportNetReportDTO.class, t -> {
+                    Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
+                    for (int i = 0; i < t.getColCount(); i++) {
+                        String key = t.getCell(0, i);
+                        String value = t.getCell(1, i);
+                        extInfoMap.put(key, value);
+                    }
+                    return extInfoMap;
+                });
+        reportData.setNetReport(dtos);
+        return reportData;
+    }
+
+    /**
+     * Builds the basic fund information from the first table keyed {@code fundInfo}.
+     * <p>
+     * The monthly fund information table has four columns: every row holds up to two
+     * name/value pairs (columns 0/1 and 2/3).
+     *
+     * @param params parser parameters (file id, file name)
+     * @param tables all tables extracted from the workbook
+     * @return the basic fund information
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_FUND_INFO_FAIL} when no such table exists
+     */
+    @Override
+    protected ReportFundInfoDTO buildFundInfo(ReportParserParams params, List<SimpleTable> tables) {
+        SimpleTable fundInfoTable = tables.stream().filter(e -> "fundInfo".equals(e.getTableKey())).findFirst().orElse(null);
+        if (fundInfoTable == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL, params.getFilename());
+        }
+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+        for (List<String> row : fundInfoTable.getTables()) {
+            if (row == null) {
+                continue;
+            }
+            // Take complete name/value pairs only, so a short row contributes what it has
+            // instead of failing the whole report with an index error.
+            for (int k = 0; k < 4 && k + 1 < row.size(); k += 2) {
+                baseInfoMap.put(row.get(k), row.get(k + 1));
+            }
+        }
+        ReportFundInfoDTO dto = new ReportFundInfoDTO(params.getFileId());
+        this.buildInfo(baseInfoMap, dto);
+        return dto;
+    }
+
+    @Override
+    protected void cleaningReportData(MonthlyReportData reportData) {
+        // todo data cleaning
+    }
+}

+ 221 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/excel/ExcelQuarterlyReportParser.java

@@ -0,0 +1,221 @@
+package com.simuwang.daq.components.report.parser.excel;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Parser for quarterly fund reports delivered as Excel workbooks.
+ */
+@Component(ReportParserConstant.PARSER_EXCEL_QUARTERLY)
+public class ExcelQuarterlyReportParser extends AbstractExcelReportParser<QuarterlyReportData> {
+    public ExcelQuarterlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_EXCEL_QUARTERLY;
+    }
+
+    /**
+     * Table definitions for the quarterly report. Each entry carries the table key used later to pick the
+     * extracted tables, the title text to look for, and numeric layout hints (see {@link CustomExcelTable}).
+     *
+     * @return the table definitions
+     */
+    @Override
+    protected List<CustomExcelTable> customExcelTables() {
+        List<CustomExcelTable> customExcelTables = ListUtil.list(true);
+        customExcelTables.add(new CustomExcelTable("fundInfo", "基金基本情况", 2));
+        customExcelTables.add(new CustomExcelTable("financialIndicators", "主要财务指标", 5, 6));
+        customExcelTables.add(new CustomExcelTable("financialIndicators", "级基金主要财务指标", 5, 6));
+        customExcelTables.add(new CustomExcelTable("assetAllocation", "期末基金资产组合情况", 3));
+        customExcelTables.add(new CustomExcelTable("investmentIndustry", "报告期末按行业分类的境内股票投资组合,报告期末按行业分类的股票投资组合", 4));
+        customExcelTables.add(new CustomExcelTable("investmentIndustry", "报告期末按行业分类的港股通投资股票投资组合", 3));
+        customExcelTables.add(new CustomExcelTable("shareChange", "基金份额变动情况", 2, 6));
+        customExcelTables.add(new CustomExcelTable("shareChange", "级基金份额变动情况", 2, 6));
+        return customExcelTables;
+    }
+
+    /**
+     * Builds the financial indicators, asset allocation, industry allocation and share change sections
+     * and assembles them into the quarterly report data.
+     *
+     * @param reportInfo     basic report information
+     * @param reportFundInfo basic fund information
+     * @param tables         all tables extracted from the workbook
+     * @return the assembled quarterly report data
+     */
+    @Override
+    protected QuarterlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO reportFundInfo, List<SimpleTable> tables) {
+        Integer fileId = reportInfo.getFileId();
+        String reportName = reportInfo.getReportName();
+        // Main financial indicators
+        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildFinancialIndicatorsInfo(fileId, reportName, tables);
+        // Asset allocation
+        List<ReportAssetAllocationDTO> assetAllocations = this.buildAssetAllocationInfo(fileId, reportName, tables);
+        // Industry allocation
+        List<ReportInvestmentIndustryDTO> investmentIndustries = this.buildInvestmentIndustryInfo(fileId, reportName, tables);
+        // Share changes
+        List<ReportShareChangeDTO> shareChanges = this.buildShareChangeInfo(fileId, reportName, tables);
+        // Assemble the result
+        QuarterlyReportData reportData = new QuarterlyReportData(reportInfo, reportFundInfo);
+        reportData.setFinancialIndicators(financialIndicators);
+        reportData.setAssetAllocation(assetAllocations);
+        reportData.setInvestmentIndustry(investmentIndustries);
+        reportData.setShareChange(shareChanges);
+        return reportData;
+    }
+
+    /**
+     * Builds the basic fund information from the first table keyed {@code fundInfo}.
+     * <p>
+     * In quarterly and annual reports the fund information is a two-column table:
+     * field name in the first cell, value in the second.
+     *
+     * @param params parser parameters (file id, file name)
+     * @param tables all tables extracted from the workbook
+     * @return the basic fund information
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_FUND_INFO_FAIL} when no such table exists
+     */
+    @Override
+    protected ReportFundInfoDTO buildFundInfo(ReportParserParams params, List<SimpleTable> tables) {
+        SimpleTable fundInfoTable = tables.stream().filter(e -> "fundInfo".equals(e.getTableKey())).findFirst().orElse(null);
+        if (fundInfoTable == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL, params.getFilename());
+        }
+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+        for (List<String> cols : fundInfoTable.getTables()) {
+            // A row without a value cell cannot contribute a name/value pair; skip it
+            // instead of failing the whole report with an index error.
+            if (cols == null || cols.size() < 2) {
+                continue;
+            }
+            baseInfoMap.put(cols.get(0), cols.get(1));
+        }
+        ReportFundInfoDTO dto = new ReportFundInfoDTO(params.getFileId());
+        this.buildInfo(baseInfoMap, dto);
+        return dto;
+    }
+
+    /**
+     * Builds the fund share change information from the tables keyed {@code shareChange}.
+     *
+     * @param fileId   file id
+     * @param filename file name, used for error reporting
+     * @param tables   all tables
+     * @return one DTO per fund level
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_SHARE_INFO_FAIL} when no such table exists
+     */
+    private List<ReportShareChangeDTO> buildShareChangeInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "shareChange".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_SHARE_INFO_FAIL, filename);
+        }
+        Function<SimpleTable, Map<String, Object>> function = t -> {
+            Map<String, Object> infoMap = MapUtil.newHashMap(16);
+            for (List<String> row : t.getTables()) {
+                // Rows lacking a value cell are skipped rather than raising an index error.
+                if (row == null || row.size() < 2) {
+                    continue;
+                }
+                String name = row.get(0);
+                if (name == null || !ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES.contains(name)) {
+                    continue;
+                }
+                infoMap.put(name, row.get(1));
+            }
+            return infoMap;
+        };
+        return this.buildLevelDto(fileId, simpleTables, ReportShareChangeDTO.class, function);
+    }
+
+    /**
+     * Builds the main financial indicators from the tables keyed {@code financialIndicators}.
+     * Only the first value column (index 1) of each recognised row is used.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return one DTO per fund level
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_FINANCIAL_INFO_FAIL} when no such table exists
+     */
+    private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "financialIndicators".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FINANCIAL_INFO_FAIL, filename);
+        }
+        Function<SimpleTable, Map<String, Object>> function = t -> {
+            Map<String, Object> infoMap = MapUtil.newHashMap(16);
+            for (List<String> row : t.getTables()) {
+                // Rows lacking a value cell are skipped rather than raising an index error.
+                if (row == null || row.size() < 2) {
+                    continue;
+                }
+                String name = row.get(0);
+                if (name == null || !ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES.contains(name)) {
+                    continue;
+                }
+                infoMap.put(name, row.get(1));
+            }
+            return infoMap;
+        };
+        return this.buildLevelDto(fileId, simpleTables, ReportFinancialIndicatorsDTO.class, function);
+    }
+
+    /**
+     * Builds the industry allocation information from the tables keyed {@code investmentIndustry}.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return one DTO per recognised industry row
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_INDUSTRY_INFO_FAIL} when no such table exists
+     */
+    private List<ReportInvestmentIndustryDTO> buildInvestmentIndustryInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        List<SimpleTable> simpleTables = tables.stream().filter(e -> "investmentIndustry".equals(e.getTableKey())).collect(Collectors.toList());
+        if (CollUtil.isEmpty(simpleTables)) {
+            throw new ReportParseException(ReportParseStatus.PARSE_INDUSTRY_INFO_FAIL, filename);
+        }
+        List<ReportInvestmentIndustryDTO> dtos = ListUtil.list(false);
+        for (SimpleTable table : simpleTables) {
+            int colCount = table.getColCount();
+            // Investment region: 1 = domestic, 2 = Hong Kong Stock Connect.
+            // The four-column table is treated as domestic, with the industry name in column 1;
+            // any other table is treated as Hong Kong Stock Connect, with the name in column 0.
+            int investType = colCount == 4 ? 1 : 2;
+            int j = colCount == 4 ? 1 : 0;
+            // Walk the table row by row
+            for (int i = 0; i < table.getRowCount(); i++) {
+                String industryName = ReportParseUtils.cleaningValue(table.getCell(i, j));
+                if (StrUtil.isBlank(industryName) || !ReportParseUtils.INDUSTRY_COLUMN_NAMES.contains(industryName)) {
+                    continue;
+                }
+                ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
+                dto.setInvestType(investType);
+                dto.setIndustryName(industryName);
+                dto.setMarketValue(ReportParseUtils.cleaningValue(table.getCell(i, j + 1)));
+                dto.setRatio(ReportParseUtils.cleaningValue(table.getCell(i, j + 2)));
+                dtos.add(dto);
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Builds the asset allocation information from the first table keyed {@code assetAllocation}.
+     * <p>
+     * Column 1 of a row holds the asset detail name and column 2 the market value. A value containing
+     * {@code #} carries remarks in the form {@code remark#value}; several such pairs may be separated
+     * by semicolons and each produces its own DTO.
+     *
+     * @param fileId   file id
+     * @param filename report name, used for error reporting
+     * @param tables   all tables
+     * @return the asset allocation DTOs
+     * @throws ReportParseException with {@link ReportParseStatus#PARSE_ASSET_INFO_FAIL} when no such table exists
+     */
+    private List<ReportAssetAllocationDTO> buildAssetAllocationInfo(Integer fileId, String filename, List<SimpleTable> tables) {
+        SimpleTable assetAllocationTable = tables.stream().filter(e -> "assetAllocation".equals(e.getTableKey())).findFirst().orElse(null);
+        if (assetAllocationTable == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_ASSET_INFO_FAIL, filename);
+        }
+        List<ReportAssetAllocationDTO> dtos = ListUtil.list(false);
+        // Walk the table row by row
+        for (List<String> row : assetAllocationTable.getTables()) {
+            // Rows without a value column cannot be interpreted; skip them instead of failing.
+            if (row == null || row.size() < 3) {
+                continue;
+            }
+            String detail = row.get(1);
+            if (detail == null || !ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {
+                continue;
+            }
+            String assetType = ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.get(detail);
+            String marketValueAndRemark = row.get(2);
+            if (!StrUtil.contains(marketValueAndRemark, "#")) {
+                dtos.add(this.newAssetAllocation(fileId, assetType, detail, marketValueAndRemark, null));
+                continue;
+            }
+            // A '#' means remarks are present; there may be several pairs separated by semicolons.
+            for (String mr : StrUtil.split(marketValueAndRemark, ";")) {
+                if (StrUtil.isBlank(mr)) {
+                    continue;
+                }
+                List<String> mrs = StrUtil.split(mr, "#");
+                if (mrs.size() < 2) {
+                    // Segment without a remark: the whole segment is the market value.
+                    dtos.add(this.newAssetAllocation(fileId, assetType, detail, mr, null));
+                } else {
+                    dtos.add(this.newAssetAllocation(fileId, assetType, detail, mrs.get(1), mrs.get(0)));
+                }
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Creates a single asset allocation DTO; the remark is only set when one is given.
+     */
+    private ReportAssetAllocationDTO newAssetAllocation(Integer fileId, String assetType, String detail,
+                                                        String marketValue, String remark) {
+        ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
+        dto.setAssetType(assetType);
+        dto.setAssetDetails(detail);
+        dto.setMarketValue(marketValue);
+        if (remark != null) {
+            dto.setRemark(remark);
+        }
+        return dto;
+    }
+}

+ 2 - 154
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -5,11 +5,11 @@ import cn.hutool.core.exceptions.ExceptionUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.common.conts.Constants;
-import com.simuwang.base.common.enums.ReportType;
 import com.simuwang.base.common.exception.ReportParseException;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
 import com.simuwang.daq.components.CustomPDFTextStripper;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.AbstractReportParser;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
@@ -21,14 +21,10 @@ import technology.tabula.Table;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.io.IOException;
-import java.util.Calendar;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
 
 /**
  * @author wangzaijun
@@ -146,22 +142,6 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     }
 
     /**
-     * 构建报告基本信息
-     *
-     * @param params /
-     * @return /
-     */
-    private ReportBaseInfoDTO buildReportInfo(ReportParserParams params) {
-        Integer fileId = params.getFileId();
-        String reportName = params.getFilename();
-        ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO(fileId);
-        reportInfo.setReportName(reportName);
-        reportInfo.setReportType(this.matchReportType(reportName));
-        reportInfo.setReportDate(this.matchReportDate(reportName));
-        return reportInfo;
-    }
-
-    /**
      * 构建只有两列表格的dto数据对象,如果有分级基金时(并且一个表格可能跨页)
      *
      * @param <DTO>    泛型对象
@@ -195,8 +175,7 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
             }
         }
         // 分级基金匹配
-        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
-        levels.add(0, "母基金");
+        List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
         for (int i = 0; i < infos.size(); i++) {
             DTO dto = this.buildDto(fileId, clazz, infos.get(i));
             if (dto == null) {
@@ -207,135 +186,4 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         }
         return dtos;
     }
-
-    /**
-     * 构建只有两列表格的dto数据对象
-     *
-     * @param <DTO>   泛型对象
-     * @param fileId  文件id
-     * @param clazz   泛型对象
-     * @param infoMap 表格转换的函数
-     * @return /
-     */
-    private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Class<DTO> clazz, Map<String, Object> infoMap) {
-        try {
-            DTO dto = clazz.getDeclaredConstructor().newInstance();
-            dto.setFileId(fileId);
-            this.buildInfo(infoMap, dto);
-            return dto;
-        } catch (Exception ignored) {
-        }
-        return null;
-    }
-
-    /**
-     * 匹配分级基金名称
-     *
-     * @param text 文本内容
-     * @return /
-     */
-    protected List<String> matchTieredFund(String text) {
-        List<String> matches = ListUtil.list(false);
-        if (StrUtil.isBlank(text)) {
-            return matches;
-        }
-        // 使用正则表达式查找匹配项
-        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
-        Matcher matcher = pattern.matcher(text);
-        // 收集所有匹配项
-        while (matcher.find()) {
-            matches.add(matcher.group());
-        }
-        // 提取字母并按字母顺序排序
-        return matches.stream()
-                .map(s -> s.replaceAll("[^A-F]", ""))
-                .distinct()
-                .sorted()
-                .map(letter -> letter + "级")
-                .collect(Collectors.toList());
-    }
-
-    /**
-     * 匹配报告日期
-     *
-     * @param string 文本内容
-     * @return 报告日期
-     */
-    private String matchReportDate(String string) {
-        if (string == null) {
-            return null;
-        }
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
-        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
-        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
-        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
-        Pattern pat5 = Pattern.compile("\\d{4}\\d{2}\\d{2}");  // 20231231
-        Pattern pat6 = Pattern.compile("(2\\d{3})年度");  // 2023年度
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(string);
-        Matcher matcher2 = pat2.matcher(string);
-        Matcher matcher3 = pat3.matcher(string);
-        Matcher matcher4 = pat4.matcher(string);
-        Matcher matcher5 = pat5.matcher(string);
-        Matcher matcher6 = pat6.matcher(string);
-        // 尝试匹配
-        if (matcher1.find()) {
-            String year = matcher1.group(1);
-            String quarter = matcher1.group(2);
-            return switch (quarter) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        } else if (matcher2.find()) {
-            return matcher2.group();
-        } else if (matcher5.find()) {
-            return matcher5.group();
-        } else if (matcher3.find()) {
-            return matcher3.group(1) + "-12-31";
-        } else if (matcher6.find()) {
-            return matcher6.group(1) + "-12-31";
-        } else if (matcher4.find()) {
-            String year = matcher4.group(1);
-            String month = matcher4.group(2);
-            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
-            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * 匹配报告类型,如“季度”、“年度”
-     *
-     * @param string 输入字符串
-     * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
-     */
-    private String matchReportType(String string) {
-        if (string == null) {
-            return null;
-        }
-        // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
-        ReportType reportType = ReportType.MONTHLY;
-        if (StrUtil.containsAny(string, ReportType.QUARTERLY.getPatterns())) {
-            reportType = ReportType.QUARTERLY;
-        } else if (StrUtil.containsAny(string, ReportType.ANNUALLY.getPatterns())) {
-            reportType = ReportType.ANNUALLY;
-        }
-        return reportType.getLabel();
-    }
-
-    private int getLastDayOfMonth(int year, int month) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.set(Calendar.YEAR, year);
-        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
-        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
-    }
-
-    private String padZero(String number) {
-        return String.format("%02d", Integer.parseInt(number));
-    }
 }

+ 11 - 11
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -5,6 +5,7 @@ import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
 import technology.tabula.Table;
@@ -48,31 +49,31 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
             }
             // 用表格的第一列的数据判断是否主要财务指标数据
             List<String> texts = this.getTableColTexts(table, 0);
-            if (CollUtil.containsAny(texts, FINANCIAL_INDICATORS_COLUMN_NAMES)) {
+            if (CollUtil.containsAny(texts, ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES)) {
                 this.financialIndicatorsTables.add(table);
                 continue;
             }
             int colCount = table.getColCount();
             if (colCount == 2) {
                 // 用表格的第一列的数据判断是否份额变动记录
-                if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
+                if (CollUtil.containsAny(texts, ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES)) {
                     this.shareChangeTables.add(table);
                 }
             } else if (colCount == 4) {
                 // 用表格的第二列的数据判断是否行业配置数据(内地)
                 texts = this.getTableColTexts(table, 1);
-                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                if (CollUtil.containsAny(texts, ReportParseUtils.INDUSTRY_COLUMN_NAMES)) {
                     this.investmentIndustryTables.add(table);
                 }
             } else if (colCount == 3) {
                 // 用表格的第一列的数据判断是否行业配置数据(港股通)
-                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                if (CollUtil.containsAny(texts, ReportParseUtils.INDUSTRY_COLUMN_NAMES)) {
                     this.investmentIndustryTables.add(table);
                     continue;
                 }
                 // 资产配置表格识别(兼容跨页的表格)获取表格中第二列的所有文字,判断所有文字中包含"股权投资"等字符串
                 texts = this.getTableColTexts(table, 1);
-                Set<String> keys = ASSET_ALLOCATION_TYPE_MAPPER.keySet();
+                Set<String> keys = ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.keySet();
                 if (CollUtil.containsAny(texts, keys)) {
                     this.assetAllocationTables.add(table);
                 }
@@ -112,22 +113,21 @@ public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyRepo
     protected List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId, Function<Table, Map<String, Object>> function) {
         List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
         // 分级基金
-        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
-        levels.add(0, "母基金");
+        List<String> levels = ReportParseUtils.matchTieredFund(String.join(",", this.textList));
         // 假设这里可能存在分级基金,不存在表格跨页
         for (int k = 0; k < this.financialIndicatorsTables.size(); k++) {
             Table table = this.financialIndicatorsTables.get(k);
             int colCount = table.getColCount();
             for (int j = 1; j < colCount; j++) {
                 Map<String, Object> infoMap = MapUtil.newHashMap(16);
-                String year = this.cleaningValue(table.getCell(0, j).getText());
+                String year = ReportParseUtils.cleaningValue(table.getCell(0, j).getText());
                 infoMap.put("年度", year);
                 for (int i = 0; i < table.getRowCount(); i++) {
-                    String columnName = this.cleaningValue(table.getCell(i, 0).getText());
-                    if (!CollUtil.contains(FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
+                    String columnName = ReportParseUtils.cleaningValue(table.getCell(i, 0).getText());
+                    if (!CollUtil.contains(ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
                         continue;
                     }
-                    String value = this.cleaningValue(table.getCell(i, j).getText());
+                    String value = ReportParseUtils.cleaningValue(table.getCell(i, j).getText());
                     infoMap.put(columnName, value);
                 }
                 ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);

+ 15 - 127
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -6,6 +6,7 @@ import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.ReportParseUtils;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
 import technology.tabula.RectangularTextContainer;
@@ -22,119 +23,6 @@ import java.util.function.Function;
  */
 @Component(ReportParserConstant.PARSER_PDF_QUARTERLY)
 public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends AbstractPDReportParser<T> {
-    protected static final List<String> INDUSTRY_COLUMN_NAMES = ListUtil.list(false);
-    protected static final List<String> SHARE_CHANGE_COLUMN_NAMES = ListUtil.list(false);
-    protected static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
-    // 资产配置明细和大类关系映射
-    protected static final Map<String, String> ASSET_ALLOCATION_TYPE_MAPPER = MapUtil.newHashMap(32, true);
-
-    static {
-        // 财务指标
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
-        FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
-
-        // 中国证监会行业标准
-        INDUSTRY_COLUMN_NAMES.add("农、林、牧、渔业");
-        INDUSTRY_COLUMN_NAMES.add("采矿业");
-        INDUSTRY_COLUMN_NAMES.add("制造业");
-        INDUSTRY_COLUMN_NAMES.add("电力、热力、燃气及水生产和供应业");
-        INDUSTRY_COLUMN_NAMES.add("建筑业");
-        INDUSTRY_COLUMN_NAMES.add("批发和零售业");
-        INDUSTRY_COLUMN_NAMES.add("交通运输、仓储和邮政业");
-        INDUSTRY_COLUMN_NAMES.add("住宿和餐饮业");
-        INDUSTRY_COLUMN_NAMES.add("信息传输、软件和信息技术服务业");
-        INDUSTRY_COLUMN_NAMES.add("金融业");
-        INDUSTRY_COLUMN_NAMES.add("房地产业");
-        INDUSTRY_COLUMN_NAMES.add("租赁和商务服务业");
-        INDUSTRY_COLUMN_NAMES.add("科学研究和技术服务业");
-        INDUSTRY_COLUMN_NAMES.add("水利、环境和公共设施管理业");
-        INDUSTRY_COLUMN_NAMES.add("居民服务、修理和其他服务业");
-        INDUSTRY_COLUMN_NAMES.add("教育");
-        INDUSTRY_COLUMN_NAMES.add("卫生和社会工作");
-        INDUSTRY_COLUMN_NAMES.add("文化、体育和娱乐业");
-        INDUSTRY_COLUMN_NAMES.add("综合");
-
-        INDUSTRY_COLUMN_NAMES.add("港股通");
-
-        // 以下为国际标准
-        INDUSTRY_COLUMN_NAMES.add("能源");
-        INDUSTRY_COLUMN_NAMES.add("原材料");
-        INDUSTRY_COLUMN_NAMES.add("工业");
-        INDUSTRY_COLUMN_NAMES.add("非日常生活消费品");
-        INDUSTRY_COLUMN_NAMES.add("日常消费品");
-        INDUSTRY_COLUMN_NAMES.add("医疗保健");
-        INDUSTRY_COLUMN_NAMES.add("金融");
-        INDUSTRY_COLUMN_NAMES.add("信息技术");
-        INDUSTRY_COLUMN_NAMES.add("通讯服务");
-        INDUSTRY_COLUMN_NAMES.add("公用事业");
-        INDUSTRY_COLUMN_NAMES.add("房地产");
-
-        // 份额变动表格识别列
-        SHARE_CHANGE_COLUMN_NAMES.add("报告期期初基金份额总额");
-        SHARE_CHANGE_COLUMN_NAMES.add("减:报告期期间基金总赎回份额");
-        SHARE_CHANGE_COLUMN_NAMES.add("期末基金总份额/期末基金实缴总额");
-        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金拆分变动份额");
-        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金总申购份额");
-
-        // 资产配置
-        ASSET_ALLOCATION_TYPE_MAPPER.put("银行存款", "现金类资产");
-        // 境内未上市、未挂牌公司股权投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("股权投资", "境内未上市、未挂牌公司股权投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:优先股", "境内未上市、未挂牌公司股权投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其他股权类投资", "境内未上市、未挂牌公司股权投资");
-        // 上市公司定向增发投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("上市公司定向增发投资", "上市公司定向增发投资");
-        // 新三板投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("新三板挂牌企业投资", "新三板投资");
-        // 境内证券投资规模
-        ASSET_ALLOCATION_TYPE_MAPPER.put("结算备付金", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("存出保证金", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("股票投资", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("债券投资", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:银行间市场债券", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:利率债", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:信用债", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("资产支持证券", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("基金投资(公募基金)", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:货币基金", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("期货及衍生品交易保证金", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("买入返售金融资产", "境内证券投资规模");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其他证券类标的", "境内证券投资规模");
-        // 资管计划投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("商业银行理财产品投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("信托计划投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("基金公司及其子公司资产管理计划投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("保险资产管理计划投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("证券公司及其子公司资产管理计划投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("期货公司及其子公司资产管理计划投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("私募基金产品投资", "资管计划投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("未在协会备案的合伙企业份额", "资管计划投资");
-        // 另类投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("另类投资", "另类投资");
-        // 境内债权类投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("银行委托贷款规模", "境内债权类投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("信托贷款", "境内债权类投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("应收账款投资", "境内债权类投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("各类受(收)益权投资", "境内债权类投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("票据(承兑汇票等)投资", "境内债权类投资");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其他债权投资", "境内债权类投资");
-        // 境外投资
-        ASSET_ALLOCATION_TYPE_MAPPER.put("境外投资", "境外投资");
-        // 其他资产
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其他资产", "其他资产");
-        // 基金负债情况
-        ASSET_ALLOCATION_TYPE_MAPPER.put("债券回购总额", "基金负债情况");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("融资、融券总额", "基金负债情况");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其中:融券总额", "基金负债情况");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("银行借款总额", "基金负债情况");
-        ASSET_ALLOCATION_TYPE_MAPPER.put("其他融资总额", "基金负债情况");
-    }
-
     protected List<Table> financialIndicatorsTables;
     protected List<Table> shareChangeTables;
     protected List<Table> assetAllocationTables;
@@ -167,9 +55,9 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 // 用表格的第一列的数据判断是否份额变动记录
                 List<String> texts = this.getTableColTexts(table, 0);
                 // 主要财务指标或份额变动
-                if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
+                if (CollUtil.containsAny(texts, ReportParseUtils.SHARE_CHANGE_COLUMN_NAMES)) {
                     this.shareChangeTables.add(table);
-                } else if (CollUtil.containsAny(texts, FINANCIAL_INDICATORS_COLUMN_NAMES)) {
+                } else if (CollUtil.containsAny(texts, ReportParseUtils.FINANCIAL_INDICATORS_COLUMN_NAMES)) {
                     this.financialIndicatorsTables.add(table);
                 }
             } else if (colCount == 4) {
@@ -178,11 +66,11 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
             } else if (colCount == 3) {
                 // 用表格的第一列单元格判断是否资产配置表
                 List<String> texts = this.getTableColTexts(table, 0);
-                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                if (CollUtil.containsAny(texts, ReportParseUtils.INDUSTRY_COLUMN_NAMES)) {
                     this.investmentIndustryTables.add(table);
                 } else {
                     texts = this.getTableColTexts(table, 1);
-                    Set<String> keys = ASSET_ALLOCATION_TYPE_MAPPER.keySet();
+                    Set<String> keys = ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.keySet();
                     if (CollUtil.containsAny(texts, keys)) {
                         this.assetAllocationTables.add(table);
                     }
@@ -287,19 +175,19 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
             int j = colCount == 4 ? 1 : 0;
             // 按行遍历
             for (int i = 0; i < table.getRowCount(); i++) {
-                String text = this.cleaningValue(table.getCell(i, 0).getText());
+                String text = ReportParseUtils.cleaningValue(table.getCell(i, 0).getText());
                 if (StrUtil.containsAny(text, "序号", "行业类别")) {
                     continue;
                 }
-                String industryName = this.cleaningValue(table.getCell(i, j).getText());
-                if (StrUtil.isBlank(industryName) || Objects.equals("合计", industryName)) {
+                String industryName = ReportParseUtils.cleaningValue(table.getCell(i, j).getText());
+                if (StrUtil.isBlank(industryName) || !ReportParseUtils.INDUSTRY_COLUMN_NAMES.contains(industryName)) {
                     continue;
                 }
                 ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
                 dto.setInvestType(investType);
                 dto.setIndustryName(industryName);
-                dto.setMarketValue(this.cleaningValue(table.getCell(i, j + 1).getText()));
-                dto.setRatio(this.cleaningValue(table.getCell(i, j + 2).getText()));
+                dto.setMarketValue(ReportParseUtils.cleaningValue(table.getCell(i, j + 1).getText()));
+                dto.setRatio(ReportParseUtils.cleaningValue(table.getCell(i, j + 2).getText()));
                 dtos.add(dto);
             }
         }
@@ -320,14 +208,14 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
                 // x坐标升序(防止部分行乱序问题)
                 row.sort(Comparator.comparing(Rectangle2D.Float::getX));
                 // 金额、市值,有时是 “备注#金额”的格式
-                String marketValueAndRemark = this.cleaningValue(row.get(2).getText());
+                String marketValueAndRemark = ReportParseUtils.cleaningValue(row.get(2).getText());
                 // 资产明细
-                String detail = this.cleaningValue(row.get(1).getText(), false);
-                if (!ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {
+                String detail = ReportParseUtils.cleaningValue(row.get(1).getText(), false);
+                if (!ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.containsKey(detail)) {
                     continue;
                 }
                 // 大类
-                String assetType = ASSET_ALLOCATION_TYPE_MAPPER.get(detail);
+                String assetType = ReportParseUtils.ASSET_ALLOCATION_TYPE_MAPPER.get(detail);
                 if (StrUtil.contains(marketValueAndRemark, "#")) {
                     // 有#表示有备注,而且可能有多个,多个用分号分隔的.
                     List<String> marketValueAndRemarks = StrUtil.split(marketValueAndRemark, ";");
@@ -365,7 +253,7 @@ public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends Abst
     protected List<String> getTableColTexts(Table table, Integer col) {
         List<String> details = ListUtil.list(false);
         for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
-            String detail = this.cleaningValue(row.get(col).getText(), false);
+            String detail = ReportParseUtils.cleaningValue(row.get(col).getText(), false);
             if (StrUtil.isNotBlank(detail)) {
                 details.add(detail);
             }

BIN=BIN
service-daq/src/main/java/com/simuwang/daq/utils/12931.pdf


BIN=BIN
service-daq/src/main/java/com/simuwang/daq/utils/2061834.pdf


+ 0 - 370
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -1,370 +0,0 @@
-//package com.simuwang.daq.utils;
-//
-//import cn.hutool.core.collection.ListUtil;
-//import cn.hutool.core.map.MapUtil;
-//import cn.hutool.core.util.ReflectUtil;
-//import cn.hutool.core.util.StrUtil;
-//import cn.hutool.http.HttpUtil;
-//import cn.hutool.json.JSONObject;
-//import cn.hutool.json.JSONUtil;
-//import com.simuwang.base.common.conts.Constants;
-//import com.simuwang.base.pojo.dto.report.PythonResult;
-//import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
-//import com.simuwang.daq.components.CustomPDFTextStripper;
-//import com.simuwang.daq.components.PythonReportConverter;
-//import com.smppw.common.pojo.ValueLabelVO;
-//import org.apache.pdfbox.Loader;
-//import org.apache.pdfbox.cos.COSName;
-//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-//import org.apache.pdfbox.pdmodel.PDDocument;
-//import org.apache.pdfbox.pdmodel.PDPage;
-//import org.apache.pdfbox.pdmodel.PDResources;
-//import org.apache.pdfbox.pdmodel.common.PDStream;
-//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-//import org.apache.pdfbox.text.PDFTextStripper;
-//import technology.tabula.*;
-//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
-//
-//import java.io.IOException;
-//import java.util.*;
-//import java.util.regex.Matcher;
-//import java.util.regex.Pattern;
-//import java.util.stream.Collectors;
-//
-//public class ReportParseUtil {
-//    public static void main(String[] args) throws IOException {
-////        String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
-////        Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
-////        Matcher matcher = pattern.matcher(fileName);
-////        String registerNumber = null;
-////        if (matcher.find()) {
-////            registerNumber = matcher.group();
-////        }
-////
-////        int type = 1;
-////        String baseUrl = "http://192.168.0.81:8088";
-////        String api = "/api/v1/parse/amac_report";
-////        Map<String, Object> params = MapUtil.newHashMap(16);
-////        params.put("file_id", 111112);
-////        params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
-////        params.put("register_number", registerNumber);
-////        params.put("file_type", type);
-////        params.put("file_name", fileName);
-////        params.put("fund_name", null);
-////        params.put("trust_name", null);
-////        String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
-////        JSONObject obj = JSONUtil.parseObj(body);
-////        PythonResult<?> result = PythonReportConverter.convert(obj, type);
-////        System.out.println(result);
-//
-//        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
-//        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
-//        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
-//        fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
-//        fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
-//        fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
-//        fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
-//        fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
-//        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
-//        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
-//
-////        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
-////        List<String> watermarks = watermarkMap.get("less");
-//
-////        System.out.println(watermarks);
-////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
-//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("C:\\Users\\Administrator\\Desktop\\self\\新报告解析\\基协报告\\季报\\SVP311_私募基金季报PDF_国恩回报6号增强私募证券投资基金_2024年06月30日.pdf"))) {
-////            PDFTextStripper stripper = new PDFTextStripper();
-////            stripper.setSortByPosition(true);
-////            String allText = stripper.getText(document);
-////            List<String> textList = StrUtil.split(allText, "\r\n");
-////            System.out.println(textList);
-//
-//            PDFTextStripper textStripper = new CustomPDFTextStripper();
-//            textStripper.setSortByPosition(true);
-//            String text1 = textStripper.getText(document);
-//            text1 = text1.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
-//            List<String> textList = StrUtil.split(text1, System.lineSeparator());
-//            textList.removeIf(StrUtil::isBlank);
-//            System.out.println(textList.get(0));
-//
-////            for (PDPage page : document.getPages()) {
-////
-//////                PDResources resources = page.getResources();
-//////                Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
-//////                Iterator<COSName> iterator = resources.getXObjectNames().iterator();
-//////                while (iterator.hasNext()) {
-//////                    COSName next = iterator.next();
-//////                    if (imageXObjectMap.containsKey(next)) {
-//////                        iterator.remove();
-//////                    }
-//////                }
-//////                removeTextWatermark(page);
-////
-////                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-////                stripper.setSortByPosition(true);
-////                stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-////                stripper.extractRegions(page);
-////                for (String region : stripper.getRegions()) {
-////                    String text = stripper.getTextForRegion(region);
-////                    String res = processString(watermarks, text);
-////                    System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
-////                }
-////            }
-////            document.save(new File("./1.pdf"));
-//
-//            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-//            PageIterator pageIterator = new ObjectExtractor(document).extract();
-//            while (pageIterator.hasNext()) {
-//                Page page = pageIterator.next();
-//                List<Table> tables = extractionAlgorithm.extract(page);
-//                tables = tables.stream().distinct().collect(Collectors.toList());
-//                for (Table table : tables) {
-//                    if (table.getColCount() == 4) {
-//                        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
-//                        for (int i = 0; i < table.getRows().size(); i++) {
-//                            List<RectangularTextContainer> cols = table.getRows().get(i);
-//                            for (int j = 0; j < 2; j++) {
-//                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
-//                            }
-//                        }
-//                        ReportFundInfoDTO reportFundInfo = new ReportFundInfoDTO();
-//                        baseInfoMap.forEach((k, v) -> {
-//                            for (ValueLabelVO vo : fieldMapper) {
-//                                String fieldName = vo.getValue();
-//                                List<String> labels = StrUtil.split(vo.getLabel(), ",");
-//                                if (labels.contains(k)) {
-//                                    ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-//                                    break;
-//                                }
-//                                for (String label : labels) {
-//                                    if (k.contains(label)) {
-//                                        ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
-//                                        break;
-//                                    }
-//                                }
-//                            }
-//                        });
-//                        System.out.println(reportFundInfo);
-//                    }
-//                }
-//            }
-//        }
-//    }
-//
-//    /**
-//     * 找图片水印
-//     *
-//     * @param page
-//     * @return
-//     * @throws IOException
-//     */
-//    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
-//        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
-//        PDResources resources = page.getResources();
-//        Iterable<COSName> xObjectNames = resources.getXObjectNames();
-//        for (COSName xObjectName : xObjectNames) {
-//            PDXObject xObject = resources.getXObject(xObjectName);
-//            PDStream stream = xObject.getStream();
-//            PDImageXObject imageXObject = null;
-//            try {
-//                imageXObject = new PDImageXObject(stream, resources);
-//            } catch (Exception e) {
-//                e.printStackTrace();
-//            }
-//            if (imageXObject != null) {
-//                watermarkMap.put(xObjectName, imageXObject);
-//            }
-//        }
-//        return watermarkMap;
-//    }
-//
-//    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
-//        Map<String, List<String>> result = MapUtil.newHashMap(32);
-//        // 生成水印列表
-//
-//        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
-//        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
-//        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
-//        String text = fundName + trustName + registerNumber;
-//        text = text.replaceAll("[()]", ""); // 移除括号
-//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-//        Collections.reverse(textList);
-//        StringBuilder sb = new StringBuilder(textList.size());
-//        for (String ch : textList) {
-//            sb.append(ch);
-//        }
-//        String joinedText = sb.toString();
-//
-//        // 基本水印列表
-//        List<String> wkList = new ArrayList<>();
-//        for (String ch : textList) {
-//            wkList.add(ch + "\r\n");
-//            wkList.add("\r\n" + ch);
-//        }
-//
-//        // 查找数字
-//        List<String> matches = findDigits(fundName);
-//        if (!matches.isEmpty()) {
-//            for (String match : matches) {
-//                wkList.add("\r\n" + match);
-//                wkList.add(match + "\r\n");
-//            }
-//        }
-//        wkList.add("-");
-//        wkList.add("【");
-//        wkList.add("】");
-//        wkList.add("\r");
-//        wkList.add("\r\n");
-//
-//        String noNumberText = removeDigits(joinedText);
-//
-//        // 生成不同字段的水印列表
-//        result.put("report_name", new ArrayList<>(wkList));
-//        result.get("report_name").addAll(convertStringToList("有限公司"));
-//
-//        result.put("less", new ArrayList<>(wkList));
-//
-//        result.put("more", new ArrayList<>(wkList));
-//        result.get("more").addAll(convertStringToList(noNumberText));
-//
-//        result.put("leverage", new ArrayList<>(wkList));
-//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-//
-//        result.put("base_info", new ArrayList<>(wkList));
-//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-//
-//        result.put("industry", new ArrayList<>(wkList));
-//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-//
-//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-//        return result;
-//    }
-//
-//    private static List<String> findDigits(String text) {
-//        List<String> digits = new ArrayList<>();
-//        Pattern pattern = Pattern.compile("\\d");
-//        Matcher matcher = pattern.matcher(text);
-//        while (matcher.find()) {
-//            digits.add(matcher.group());
-//        }
-//        return digits;
-//    }
-//
-//    private static String removeDigits(String text) {
-//        return text.replaceAll("\\d", "");
-//    }
-//
-//    private static String removeKeywords(String text, String... keywords) {
-//        for (String keyword : keywords) {
-//            text = text.replaceAll(keyword, "");
-//        }
-//        return text;
-//    }
-//
-//    private static List<String> convertStringToList(String text) {
-//        List<String> charList = new ArrayList<>();
-//        for (char c : text.toCharArray()) {
-//            charList.add(c + "");
-//        }
-//        return charList;
-//    }
-//
-//    public static String processString(List<String> wmList, String string) {
-//        // 生成正则表达式模式
-//        String pat = String.join("|", wmList);
-//        // 使用正则表达式移除wmList中的元素
-//        string = removeMatches(string, pat);
-//        // 替换中文括号为英文括号
-//        string = string.replace("(", "(").replace(")", ")");
-//        // 移除空格
-//        string = string.replace(" ", "");
-//        // 如果字符串以括号开头,则移除第一个字符
-//        if (startsWithParenthesis(string)) {
-//            string = string.substring(1);
-//        }
-//
-//        return string;
-//    }
-//
-//    private static String removeMatches(String input, String pattern) {
-//        // 编译正则表达式
-//        Pattern compiledPattern = Pattern.compile(pattern);
-//        // 创建Matcher对象
-//        Matcher matcher = compiledPattern.matcher(input);
-//        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-//        return matcher.replaceAll("");
-//    }
-//
-//    private static boolean startsWithParenthesis(String input) {
-//        // 匹配以括号开头的字符串
-//        Pattern pattern = Pattern.compile("^[()].*");
-//        Matcher matcher = pattern.matcher(input);
-//        return matcher.find();
-//    }
-//
-////    public static void removeTextWatermark(PDPage page) throws IOException {
-////        PDResources resources = page.getResources();
-//////        if (StrUtil.isAllBlank(fundName, trustName)) {
-//////            return;
-//////        }
-////        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-////        stripper.setSortByPosition(true);
-////        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-////        stripper.extractRegions(page);
-////
-////        PDFStreamEngine engine = new PDFTextStripper();
-////        engine.addOperator(new SetMatrix(stripper));
-////
-////    }
-////
-////    private static void processResources(PDResources resources) throws IOException {
-////        for (COSName name : resources.getXObjectNames()) {
-////            PDXObject xobject = resources.getXObject(name);
-////            if (xobject instanceof PDFormXObject) {
-////                PDFormXObject formXObject = (PDFormXObject) xobject;
-////                writeTokensToStream(formXObject.getContentStream(),
-////                        createTokensWithoutText(formXObject));
-////                processResources(formXObject.getResources());
-////            }
-////        }
-////        for (COSName name : resources.getPatternNames()) {
-////            PDAbstractPattern pattern = resources.getPattern(name);
-////            if (pattern instanceof PDTilingPattern) {
-////                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
-////                writeTokensToStream(tilingPattern.getContentStream(),
-////                        createTokensWithoutText(tilingPattern));
-////                processResources(tilingPattern.getResources());
-////            }
-////        }
-////    }
-////
-////    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
-////        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
-////            ContentStreamWriter writer = new ContentStreamWriter(out);
-////            writer.writeTokens(newTokens);
-////        }
-////    }
-////
-////    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
-////        PDFStreamParser parser = new PDFStreamParser(contentStream);
-////        Object token = parser.parseNextToken();
-////        List<Object> newTokens = new ArrayList<>();
-////        while (token != null) {
-////            if (token instanceof Operator op) {
-////                String opName = op.getName();
-////                if (OperatorName.SET_MATRIX.equals(opName)) {
-////                    // remove the argument to this operator
-////                    newTokens.remove(newTokens.size() - 1);
-////
-////                    token = parser.parseNextToken();
-////                    continue;
-////                }
-////            }
-////            newTokens.add(token);
-////            token = parser.parseNextToken();
-////        }
-////        return newTokens;
-////    }
-//}

+ 2 - 2
service-deploy/src/test/java/com/simuwang/ApplicationTest.java

@@ -45,8 +45,8 @@ public class ApplicationTest {
     @Test
     public void reportTest() {
         MailboxInfoDTO emailInfoDTO = this.buildMailbox();
-        Date startDate = DateUtil.parse("2024-10-14 15:10:30", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2024-10-14 17:50:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2024-10-15 11:10:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-10-15 17:50:30", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
         } catch (Exception e) {