Explorar o código

Merge branch 'test' of http://112.74.196.215:3000/Tech2/data-daq into test

chenjianhua hai 6 meses
pai
achega
e279c2a7c4
Modificáronse 68 ficheiros con 2591 adicións e 1510 borrados
  1. 2 0
      service-base/src/main/java/com/simuwang/base/common/conts/Constants.java
  2. 34 0
      service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java
  3. 18 4
      service-base/src/main/java/com/simuwang/base/common/enums/ReportType.java
  4. 36 0
      service-base/src/main/java/com/simuwang/base/common/exception/ReportParseException.java
  5. 5 0
      service-base/src/main/java/com/simuwang/base/config/DaqProperties.java
  6. 1 0
      service-base/src/main/java/com/simuwang/base/mapper/EmailFieldMappingMapper.java
  7. 4 0
      service-base/src/main/java/com/simuwang/base/pojo/dos/EmailFieldMappingDO.java
  8. 3 3
      service-base/src/main/java/com/simuwang/base/pojo/dos/report/ReportAssetAllocationDO.java
  9. 52 2
      service-base/src/main/java/com/simuwang/base/pojo/dos/report/ReportFundInfoDO.java
  10. 4 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/AnnuallyReportData.java
  11. 57 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java
  12. 32 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportLevelDTO.java
  13. 4 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/MonthlyReportData.java
  14. 6 1
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/PythonResult.java
  15. 4 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/QuarterlyReportData.java
  16. 16 9
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportAssetAllocationDTO.java
  17. 9 2
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportBaseInfoDTO.java
  18. 16 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java
  19. 58 21
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportFinancialIndicatorsDTO.java
  20. 66 9
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportFundInfoDTO.java
  21. 14 8
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportInvestmentIndustryDTO.java
  22. 29 20
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportNetReportDTO.java
  23. 30 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParseStatus.java
  24. 29 0
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParserParams.java
  25. 25 20
      service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportShareChangeDTO.java
  26. 10 1
      service-base/src/main/java/com/simuwang/shiro/core/jwt/JwtContext.java
  27. 2 1
      service-base/src/main/resources/mapper/EmailFieldMappingMapper.xml
  28. 0 184
      service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java
  29. 23 19
      service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
  30. 190 0
      service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java
  31. 0 285
      service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java
  32. 10 9
      service-daq/src/main/java/com/simuwang/daq/components/PythonReportConverter.java
  33. 0 18
      service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java
  34. 117 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
  35. 33 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java
  36. 69 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java
  37. 32 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserFactory.java
  38. 330 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
  39. 156 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
  40. 89 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java
  41. 296 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
  42. 78 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java
  43. 25 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonAnnuallyReportParser.java
  44. 25 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonMonthlyReportParser.java
  45. 25 0
      service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonQuarterlyReportParser.java
  46. 56 0
      service-daq/src/main/java/com/simuwang/daq/components/report/writer/AbstractReportWriter.java
  47. 1 1
      service-daq/src/main/java/com/simuwang/daq/components/writer/AnnuallyReportWriter.java
  48. 1 1
      service-daq/src/main/java/com/simuwang/daq/components/writer/MonthlyReportWriter.java
  49. 1 1
      service-daq/src/main/java/com/simuwang/daq/components/writer/QuarterlyReportWriter.java
  50. 12 0
      service-daq/src/main/java/com/simuwang/daq/components/report/writer/ReportWriter.java
  51. 1 1
      service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriterConstant.java
  52. 1 1
      service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriterFactory.java
  53. 0 73
      service-daq/src/main/java/com/simuwang/daq/components/writer/AbstractReportWriter.java
  54. 0 7
      service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriter.java
  55. 0 43
      service-daq/src/main/java/com/simuwang/daq/dto/MonthlyReportNavInfo.java
  56. 0 13
      service-daq/src/main/java/com/simuwang/daq/dto/ReportExtInfo.java
  57. 0 18
      service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java
  58. 0 291
      service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java
  59. 0 54
      service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java
  60. 97 71
      service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java
  61. 4 2
      service-daq/src/main/java/com/simuwang/daq/service/ReportEmailParser.java
  62. 0 20
      service-daq/src/main/java/com/simuwang/daq/service/ReportParseService.java
  63. 269 255
      service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java
  64. 61 0
      service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java
  65. 0 12
      service-deploy/pom.xml
  66. 2 0
      service-deploy/src/main/resources/application.yml
  67. 21 9
      service-deploy/src/test/java/com/simuwang/ApplicationTest.java
  68. 0 21
      service-manage/src/main/java/com/simuwang/manage/api/test/ReportParseTestApi.java

+ 2 - 0
service-base/src/main/java/com/simuwang/base/common/conts/Constants.java

@@ -7,6 +7,8 @@ package com.simuwang.base.common.conts;
  * @author ruoyi
  */
 public class Constants {
+    public static final String WATERMARK_REPLACE = "+_+" + System.lineSeparator();
+
     public static final long DEFAULT_SERIAL_ID = 999L;
 
     /**

+ 34 - 0
service-base/src/main/java/com/simuwang/base/common/enums/ReportParserFileType.java

@@ -0,0 +1,34 @@
+package com.simuwang.base.common.enums;
+
+import cn.hutool.core.util.StrUtil;
+
+import java.util.Arrays;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 10:57
+ * @description File-format types for report parsing; parsing through the Python interface is also supported
+ */
+public enum ReportParserFileType {
+    PDF("pdf"),
+    DOCX("docx"),
+    DOC("doc"),
+    XLSX("xlsx"),
+    XLS("xls"),
+    PYTHON("python");
+
+    private final String suffix;
+
+    ReportParserFileType(String suffix) {
+        this.suffix = suffix;
+    }
+
+    /**
+     * Looks up the constant whose suffix matches the given one, as compared by StrUtil.equals.
+     *
+     * @param suffix suffix to look for
+     * @return the first matching constant in declaration order, or {@code null} when none matches
+     */
+    public static ReportParserFileType getBySuffix(String suffix) {
+        for (ReportParserFileType candidate : values()) {
+            if (StrUtil.equals(candidate.suffix, suffix)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+
+    public String getSuffix() {
+        return suffix;
+    }
+}

+ 18 - 4
service-base/src/main/java/com/simuwang/base/common/enums/ReportType.java

@@ -2,17 +2,31 @@ package com.simuwang.base.common.enums;
 
 import lombok.Getter;
 
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
 @Getter
 public enum ReportType {
-    MONTHLY(0, "月报"),
-    QUARTERLY(1, "季报"),
-    ANNUALLY(2, "年报");
+    MONTHLY(0, "月报", new String[]{"月", "月度", "月报"}),
+    QUARTERLY(1, "季报", new String[]{"季", "季度", "季报"}),
+    ANNUALLY(2, "年报", new String[]{"年", "年度", "年报"});
 
     private final int type;
     private final String label;
+    private final String[] patterns;
 
-    ReportType(int type, String label) {
+    ReportType(int type, String label, String[] patterns) {
         this.type = type;
         this.label = label;
+        this.patterns = patterns;
+    }
+
+    public static String getAllPatterns() {
+        return String.join("|", patterns());
+    }
+
+    public static List<String> patterns() {
+        return Arrays.stream(ReportType.values()).flatMap(e -> Arrays.stream(e.getPatterns())).collect(Collectors.toList());
     }
 }

+ 36 - 0
service-base/src/main/java/com/simuwang/base/common/exception/ReportParseException.java

@@ -0,0 +1,36 @@
+package com.simuwang.base.common.exception;
+
+import cn.hutool.core.util.StrUtil;
+import com.smppw.common.pojo.enums.status.StatusCode;
+
+/**
+ * @author wangzaijun
+ * @date 2024/10/11 14:10
+ * @description Exception for report parsing; carries a status code and a message
+ */
+public class ReportParseException extends RuntimeException {
+    private final Integer code;
+    private final String msg;
+
+    /**
+     * Primary constructor; the other constructors delegate to it.
+     *
+     * @param code status code
+     * @param msg  message, also handed to {@link RuntimeException} as the exception message
+     */
+    public ReportParseException(Integer code, String msg) {
+        super(msg);
+        this.msg = msg;
+        this.code = code;
+    }
+
+    /**
+     * Takes the code and the message of the given status unchanged.
+     *
+     * @param statusCode status supplying code and message
+     */
+    public ReportParseException(StatusCode statusCode) {
+        this(statusCode.getCode(), statusCode.getMsg());
+    }
+
+    /**
+     * Takes the code of the given status and formats its message with {@code msgs} via StrUtil.format.
+     *
+     * @param statusCode status supplying the code and the message template
+     * @param msgs       arguments passed to the formatter
+     */
+    public ReportParseException(StatusCode statusCode, Object... msgs) {
+        this(statusCode.getCode(), StrUtil.format(statusCode.getMsg(), msgs));
+    }
+
+    /**
+     * NOTE(review): the field is a boxed Integer, so the unboxing here fails with a
+     * NullPointerException if the exception was created with a null code.
+     *
+     * @return the status code
+     */
+    public int getCode() {
+        return code;
+    }
+
+    /**
+     * @return the message given at construction time
+     */
+    public String getMsg() {
+        return msg;
+    }
+}

+ 5 - 0
service-base/src/main/java/com/simuwang/base/config/DaqProperties.java

@@ -32,6 +32,11 @@ public class DaqProperties {
      */
     private String tokenSecret;
     /**
+     * 是否开启python的报告解析功能,开启后报告全部用python接口来解析
+     * 当开启时要配置python解析地址
+     */
+    private Boolean enablePyParser = Boolean.FALSE;
+    /**
      * 报告解析的python接口地址
      */
     private String pyBaseUrl = "http://localhost:8080";

+ 1 - 0
service-base/src/main/java/com/simuwang/base/mapper/EmailFieldMappingMapper.java

@@ -11,6 +11,7 @@ public interface EmailFieldMappingMapper {
     /**
      * 获取净值文件字段识别映射配置
      *
+     * @param type 0-公共的字段,1-净值和估值表解析的字段,3-定期报告解析的字段
      * @return 净值文件字段识别映射配置
      */
     List<EmailFieldMappingDO> getEmailFieldMapping(Integer type);

+ 4 - 0
service-base/src/main/java/com/simuwang/base/pojo/dos/EmailFieldMappingDO.java

@@ -26,6 +26,10 @@ public class EmailFieldMappingDO {
     @TableField(value = "name")
     private String name;
     /**
+     * 1-净值或估值表,3-定期报告,0-表示共用的,默认0
+     */
+    private Integer type;
+    /**
      * 记录的有效性;1-有效;0-无效;
      */
     @TableField(value = "isvalid")

+ 3 - 3
service-base/src/main/java/com/simuwang/base/pojo/dos/report/ReportAssetAllocationDO.java

@@ -16,15 +16,15 @@ import java.math.BigDecimal;
 @TableName("amac_report_asset_allocation")
 public class ReportAssetAllocationDO extends BaseReportDO {
     /**
-     * 资产类
+     * 资产
      */
     private String assetType;
     /**
-     * 资产类别
+     * 资产明细
      */
     private String columnName;
     /**
-     * 资产类别
+     * 市值
      */
     private BigDecimal marketValue;
     /**

+ 52 - 2
service-base/src/main/java/com/simuwang/base/pojo/dos/report/ReportFundInfoDO.java

@@ -17,23 +17,73 @@ import java.util.Date;
 @Getter
 @TableName("amac_report_fund_info")
 public class ReportFundInfoDO extends BaseReportDO {
+    /**
+     * 投资顾问
+     */
     private String advisorName;
+    /**
+     * 基金托管人
+     */
     private String custodianName;
+    /**
+     * 基金经理描述
+     */
     private String fundManager;
+    /**
+     * 基金名称
+     */
     private String fundName;
+    /**
+     * 投资策略
+     */
     private String fundStrategyDescription;
+    /**
+     * 基金成立日期
+     */
     private Date inceptionDate;
+    /**
+     * 行业趋势
+     */
     private String industryTrend;
+    /**
+     * 投资目标
+     */
     private String investmentObjective;
+    /**
+     * 杠杆比例
+     */
     private BigDecimal leverage;
+    /**
+     * 杠杆比例描述
+     */
     private String leverageNote;
+    /**
+     * 基金运作方式
+     */
     private String operationType;
+    /**
+     * 备案编码
+     */
     private String registerNumber;
+    /**
+     * 风险收益特征
+     */
     private String riskReturnDesc;
+    /**
+     * 业绩比较基准
+     */
     private String secondaryBenchmark;
+    /**
+     * 基金管理人
+     */
     private String trustName;
-
+    /**
+     * 基金到期日期
+     */
     private Date dueDate;
+    /**
+     * 信息披露报告是否经托管机构复核
+     */
     @TableField(value = "reviewed")
-    private Integer isReviewed;
+    private Integer reviewed;
 }

+ 4 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/AnnuallyReportData.java

@@ -7,6 +7,10 @@ import lombok.Setter;
 @Setter
 @Getter
 public class AnnuallyReportData extends QuarterlyReportData {
+    public AnnuallyReportData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo) {
+        super(baseInfo, fundInfo);
+    }
+
     @Override
     public ReportType getReportType() {
         return ReportType.ANNUALLY;

+ 57 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportDTO.java

@@ -1,18 +1,75 @@
 package com.simuwang.base.pojo.dto.report;
 
+import cn.hutool.core.date.DatePattern;
+import cn.hutool.core.date.DateUtil;
+import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.pojo.dos.report.BaseReportDO;
 import lombok.Getter;
 import lombok.Setter;
 
+import java.math.BigDecimal;
+import java.util.Date;
+
+/**
+ * @author wangzaijun
+ * @date 2024/10/9 11:08
+ * @description 抽象的报告数据父类,全部字段用string传递
+ */
 @Setter
 @Getter
 public abstract class BaseReportDTO<T extends BaseReportDO> {
     private Integer fileId;
 
+    public BaseReportDTO() {
+    }
+
+    public BaseReportDTO(Integer fileId) {
+        this.fileId = fileId;
+    }
+
     public abstract T toEntity();
 
     @Override
     public String toString() {
         return "fileId=" + fileId;
     }
+
+    /**
+     * Converts a string to a date.
+     *
+     * @param input the string to convert
+     * @return the parsed date, or {@code null} when the input is blank or parsing fails
+     */
+    protected Date toDate(String input) {
+        if (StrUtil.isBlank(input)) {
+            return null;
+        }
+        String text = input.trim();
+        try {
+            // Three formats are tried: yyyy年MM月dd日, yyyy-MM-dd and yyyy/MM/dd
+            return DateUtil.parse(text,
+                    DatePattern.CHINESE_DATE_PATTERN, DatePattern.NORM_DATE_PATTERN, "yyyy/MM/dd");
+        } catch (Exception ignored) {
+            // Best effort: a value that cannot be parsed is reported as null
+            return null;
+        }
+    }
+
+    /**
+     * Converts a string to a number.
+     * <p>
+     * Everything except digits and "." is removed before parsing (thousands separators,
+     * units, percent signs, ...). A leading minus sign is honoured, so "-1,234.56" yields
+     * -1234.56 instead of silently becoming positive.
+     *
+     * @param input the string to convert
+     * @return the parsed number, or {@code null} when the input is blank or no valid number remains
+     */
+    protected BigDecimal toBigDecimal(String input) {
+        if (StrUtil.isBlank(input)) {
+            return null;
+        }
+        String text = input.trim();
+        // The cleaning below drops the sign, so remember it first
+        // (ASCII hyphen-minus, full-width hyphen-minus U+FF0D, minus sign U+2212)
+        boolean negative = text.startsWith("-") || text.startsWith("\uFF0D") || text.startsWith("\u2212");
+        // Remove every character that is neither a digit nor "."
+        String cleanedInput = text.replaceAll("[^\\d.]", "");
+        try {
+            BigDecimal value = new BigDecimal(cleanedInput);
+            return negative ? value.negate() : value;
+        } catch (NumberFormatException ignored) {
+            // Nothing parseable is left (e.g. empty string or several dots): treated as "no value"
+            return null;
+        }
+    }
 }

+ 32 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/BaseReportLevelDTO.java

@@ -0,0 +1,32 @@
+package com.simuwang.base.pojo.dto.report;
+
+import com.simuwang.base.pojo.dos.report.BaseReportDO;
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * Base class for report DTOs that carry a fund level in addition to the file id.
+ */
+@Setter
+@Getter
+public abstract class BaseReportLevelDTO<T extends BaseReportDO> extends BaseReportDTO<T> {
+    /**
+     * Fund level
+     */
+    private String level;
+
+    /**
+     * Creates an instance without file id and level.
+     */
+    public BaseReportLevelDTO() {
+    }
+
+    /**
+     * Creates an instance for the given file; the level stays {@code null}.
+     */
+    public BaseReportLevelDTO(Integer fileId) {
+        this(fileId, null);
+    }
+
+    /**
+     * Creates an instance for the given file and level.
+     */
+    public BaseReportLevelDTO(Integer fileId, String level) {
+        super(fileId);
+        this.level = level;
+    }
+
+    /**
+     * Appends the level to the text produced by the parent class.
+     */
+    @Override
+    public String toString() {
+        return String.format("%s, level='%s'", super.toString(), level);
+    }
+}

+ 4 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/MonthlyReportData.java

@@ -11,6 +11,10 @@ import java.util.List;
 public class MonthlyReportData extends ReportData {
     private List<ReportNetReportDTO> netReport;
 
+    public MonthlyReportData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo) {
+        super(baseInfo, fundInfo);
+    }
+
     @Override
     public ReportType getReportType() {
         return ReportType.MONTHLY;

+ 6 - 1
service-base/src/main/java/com/simuwang/base/pojo/dto/report/PythonResult.java

@@ -3,9 +3,14 @@ package com.simuwang.base.pojo.dto.report;
 import lombok.Getter;
 import lombok.Setter;
 
+/**
+ * @author wangzaijun
+ * @date 2024/10/10 14:08
+ * @description 报告解析结果
+ */
 @Setter
 @Getter
-public class PythonResult<T extends ReportData> {
+public class ParseResult<T extends ReportData> {
     private Integer status;
 
     private String msg;

+ 4 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/QuarterlyReportData.java

@@ -19,6 +19,10 @@ public class QuarterlyReportData extends ReportData {
     private List<ReportInvestmentIndustryDTO> investmentIndustry;
     private List<ReportShareChangeDTO> shareChange;
 
+    public QuarterlyReportData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo) {
+        super(baseInfo, fundInfo);
+    }
+
     @Override
     public ReportType getReportType() {
         return ReportType.QUARTERLY;

+ 16 - 9
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportAssetAllocationDTO.java

@@ -4,8 +4,6 @@ import com.simuwang.base.pojo.dos.report.ReportAssetAllocationDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
-
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:43
@@ -15,28 +13,37 @@ import java.math.BigDecimal;
 @Getter
 public class ReportAssetAllocationDTO extends BaseReportDTO<ReportAssetAllocationDO> {
     /**
-     * 资产类
+     * 资产
      */
     private String assetType;
     /**
-     * 资产类别
+     * 资产明细
      */
-    private String columnName;
+    private String assetDetails;
     /**
-     * 资产类别
+     * 市值
      */
-    private BigDecimal marketValue;
+    private String marketValue;
     /**
      * 备注
      */
     private String remark;
 
+    public ReportAssetAllocationDTO() {
+        super();
+    }
+
+    public ReportAssetAllocationDTO(Integer fileId) {
+        super(fileId);
+    }
+
     @Override
     public ReportAssetAllocationDO toEntity() {
         ReportAssetAllocationDO entity = new ReportAssetAllocationDO();
         entity.setFileId(this.getFileId());
         entity.setAssetType(this.assetType);
-        entity.setMarketValue(this.marketValue);
+        entity.setColumnName(this.assetDetails);
+        entity.setMarketValue(this.toBigDecimal(this.marketValue));
         entity.setRemark(this.remark);
         return entity;
     }
@@ -46,7 +53,7 @@ public class ReportAssetAllocationDTO extends BaseReportDTO<ReportAssetAllocatio
         return "{" +
                 super.toString() +
                 ", assetType='" + assetType + '\'' +
-                ", columnName='" + columnName + '\'' +
+                ", assetDetails='" + assetDetails + '\'' +
                 ", marketValue=" + marketValue +
                 ", remark='" + remark + '\'' +
                 '}';

+ 9 - 2
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportBaseInfoDTO.java

@@ -1,6 +1,5 @@
 package com.simuwang.base.pojo.dto.report;
 
-import cn.hutool.core.date.DateUtil;
 import com.simuwang.base.pojo.dos.report.ReportBaseInfoDO;
 import lombok.Getter;
 import lombok.Setter;
@@ -26,11 +25,19 @@ public class ReportBaseInfoDTO extends BaseReportDTO<ReportBaseInfoDO> {
      */
     private String reportType;
 
+    public ReportBaseInfoDTO() {
+        super();
+    }
+
+    public ReportBaseInfoDTO(Integer fileId) {
+        super(fileId);
+    }
+
     @Override
     public ReportBaseInfoDO toEntity() {
         ReportBaseInfoDO entity = new ReportBaseInfoDO();
         entity.setFileId(this.getFileId());
-        entity.setReportDate(this.reportDate == null ? null : DateUtil.parseDate(this.reportDate));
+        entity.setReportDate(this.toDate(this.reportDate));
         entity.setReportName(this.reportName);
         entity.setReportType(this.reportType);
         return entity;

+ 16 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportData.java

@@ -4,12 +4,28 @@ import com.simuwang.base.common.enums.ReportType;
 import lombok.Getter;
 import lombok.Setter;
 
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 9:32
+ * @description 报告解析结果对象
+ */
 @Setter
 @Getter
 public abstract class ReportData {
+    /**
+     * 报告基本信息
+     */
     private ReportBaseInfoDTO baseInfo;
+    /**
+     * 报告包含的基金基本信息
+     */
     private ReportFundInfoDTO fundInfo;
 
+    public ReportData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo) {
+        this.baseInfo = baseInfo;
+        this.fundInfo = fundInfo;
+    }
+
     public abstract ReportType getReportType();
 
     @Override

+ 58 - 21
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportFinancialIndicatorsDTO.java

@@ -1,41 +1,77 @@
 package com.simuwang.base.pojo.dto.report;
 
+import cn.hutool.core.util.StrUtil;
 import com.simuwang.base.pojo.dos.report.ReportFinancialIndicatorsDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 @Setter
 @Getter
-public class ReportFinancialIndicatorsDTO extends BaseReportDTO<ReportFinancialIndicatorsDO> {
-    private String level;
-
+public class ReportFinancialIndicatorsDTO extends BaseReportLevelDTO<ReportFinancialIndicatorsDO> {
     /**
      * 年度
      */
-    private Integer endDate;
-
-    private BigDecimal fundAssetSize;
-    private BigDecimal nav;
-    private BigDecimal profit;
-    private BigDecimal realizedIncome;
+    private String yearly;
+    /**
+     * 期末基金净资产
+     */
+    private String assetNet;
+    /**
+     * 报告期期末单位净值
+     */
+    private String nav;
+    /**
+     * 本期利润
+     */
+    private String profit;
+    /**
+     * 本期已实现收益
+     */
+    private String realizedIncome;
     /**
      * 期末可供分配利润
      */
-    private BigDecimal undistributedProfit;
+    private String undistributedProfit;
+    /**
+     * 期末可供分配基金份额利润
+     */
+    private String undistributedShareProfit;
+    /**
+     * 基金份额累计净值增长率
+     */
+    private String shareNavRet;
+
+    public ReportFinancialIndicatorsDTO() {
+        super();
+    }
+
+    public ReportFinancialIndicatorsDTO(Integer fileId) {
+        super(fileId);
+    }
+
+    public ReportFinancialIndicatorsDTO(Integer fileId, String level) {
+        super(fileId, level);
+    }
 
     @Override
     public ReportFinancialIndicatorsDO toEntity() {
         ReportFinancialIndicatorsDO entity = new ReportFinancialIndicatorsDO();
         entity.setFileId(this.getFileId());
-        entity.setLevel(this.level);
-        entity.setEndDate(this.endDate);
-        entity.setFundAssetSize(this.fundAssetSize);
-        entity.setNav(this.nav);
-        entity.setProfit(this.profit);
-        entity.setRealizedIncome(this.realizedIncome);
-        entity.setUndistributedProfit(this.undistributedProfit);
+        entity.setLevel(this.getLevel());
+        entity.setFundAssetSize(this.toBigDecimal(this.assetNet));
+        entity.setNav(this.toBigDecimal(this.nav));
+        entity.setProfit(this.toBigDecimal(this.profit));
+        entity.setRealizedIncome(this.toBigDecimal(this.realizedIncome));
+        entity.setUndistributedProfit(this.toBigDecimal(this.undistributedProfit));
+        if (StrUtil.isNotBlank(this.yearly)) {
+            Matcher matcher = Pattern.compile("\\d+").matcher(this.yearly);
+            if (matcher.find()) {
+                entity.setEndDate(Integer.parseInt(matcher.group()));
+            }
+        }
         return entity;
     }
 
@@ -43,13 +79,14 @@ public class ReportFinancialIndicatorsDTO extends BaseReportDTO<ReportFinancialI
     public String toString() {
         return "{" +
                 super.toString() +
-                ", level='" + level + '\'' +
-                ", endDate=" + endDate +
-                ", fundAssetSize=" + fundAssetSize +
+                ", yearly=" + yearly +
+                ", assetNet=" + assetNet +
                 ", nav=" + nav +
                 ", profit=" + profit +
                 ", undistributedProfit=" + undistributedProfit +
                 ", realizedIncome=" + realizedIncome +
+                ", undistributedShareProfit=" + undistributedShareProfit +
+                ", shareNavRet=" + shareNavRet +
                 '}';
     }
 }

+ 66 - 9
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportFundInfoDTO.java

@@ -1,11 +1,10 @@
 package com.simuwang.base.pojo.dto.report;
 
-import cn.hutool.core.date.DateUtil;
 import com.simuwang.base.pojo.dos.report.ReportFundInfoDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
+import java.util.Objects;
 
 /**
  * @author wangzaijun
@@ -15,24 +14,82 @@ import java.math.BigDecimal;
 @Setter
 @Getter
 public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
+    /**
+     * 投资顾问
+     */
     private String advisorName;
+    /**
+     * 基金托管人
+     */
     private String custodianName;
+    /**
+     * 基金经理描述
+     */
     private String fundManager;
+    /**
+     * 基金名称
+     */
     private String fundName;
+    /**
+     * 投资策略
+     */
     private String fundStrategyDescription;
+    /**
+     * 基金成立日期
+     */
     private String inceptionDate;
+    /**
+     * 行业趋势
+     */
     private String industryTrend;
+    /**
+     * 投资目标
+     */
     private String investmentObjective;
-    private BigDecimal leverage;
+    /**
+     * 杠杆比例
+     */
+    private String leverage;
+    /**
+     * 杠杆比例描述
+     */
     private String leverageNote;
+    /**
+     * 基金运作方式
+     */
     private String operationType;
+    /**
+     * 备案编码
+     */
     private String registerNumber;
+    /**
+     * 风险收益特征
+     */
     private String riskReturnDesc;
+    /**
+     * 业绩比较基准
+     */
     private String secondaryBenchmark;
+    /**
+     * 基金管理人
+     */
     private String trustName;
-
+    /**
+     * 基金到期日期
+     */
     private String dueDate;
-    private Integer isReviewed;
+    /**
+     * 信息披露报告是否经托管机构复核
+     */
+    private String isReviewed;
+
+    public ReportFundInfoDTO() {
+        super();
+    }
+
+    public ReportFundInfoDTO(Integer fileId) {
+        super(fileId);
+    }
 
     @Override
     public ReportFundInfoDO toEntity() {
@@ -43,18 +100,18 @@ public class ReportFundInfoDTO extends BaseReportDTO<ReportFundInfoDO> {
         entity.setFundManager(this.fundManager);
         entity.setFundName(this.fundName);
         entity.setFundStrategyDescription(this.fundStrategyDescription);
-        entity.setInceptionDate(this.inceptionDate == null ? null : DateUtil.parseDate(this.inceptionDate));
+        entity.setInceptionDate(this.toDate(this.inceptionDate));
         entity.setIndustryTrend(this.industryTrend);
         entity.setInvestmentObjective(this.investmentObjective);
-        entity.setLeverage(this.leverage);
+        entity.setLeverage(this.toBigDecimal(this.leverage));
         entity.setLeverageNote(this.leverageNote);
         entity.setOperationType(this.operationType);
         entity.setRegisterNumber(this.registerNumber);
         entity.setRiskReturnDesc(this.riskReturnDesc);
         entity.setSecondaryBenchmark(this.secondaryBenchmark);
         entity.setTrustName(this.trustName);
-        entity.setDueDate(this.dueDate == null ? null : DateUtil.parseDate(this.dueDate));
-        entity.setIsReviewed(this.isReviewed);
+        entity.setDueDate(this.toDate(this.dueDate));
+        entity.setReviewed(Objects.equals("是", this.isReviewed) ? 1 : 0);
         return entity;
     }
 

+ 14 - 8
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportInvestmentIndustryDTO.java

@@ -4,8 +4,6 @@ import com.simuwang.base.pojo.dos.report.ReportInvestmentIndustryDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
-
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:49
@@ -31,13 +29,21 @@ public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentI
      */
     private String isbCode;
     /**
-     * 公允价值
+     * 公允价值,市值
      */
-    private BigDecimal marketValue;
+    private String marketValue;
     /**
-     * 占基金资产净值的比例
+     * 占基金资产净值的比例,占净值比,权重
      */
-    private BigDecimal ratio;
+    private String ratio;
+
+    public ReportInvestmentIndustryDTO() {
+        super();
+    }
+
+    public ReportInvestmentIndustryDTO(Integer fileId) {
+        super(fileId);
+    }
 
     @Override
     public ReportInvestmentIndustryDO toEntity() {
@@ -47,8 +53,8 @@ public class ReportInvestmentIndustryDTO extends BaseReportDTO<ReportInvestmentI
         entity.setIndustryName(this.industryName);
         entity.setInvestType(this.investType);
         entity.setIsbCode(this.isbCode);
-        entity.setMarketValue(this.marketValue);
-        entity.setRatio(this.ratio);
+        entity.setMarketValue(this.toBigDecimal(this.marketValue));
+        entity.setRatio(this.toBigDecimal(this.ratio));
         return entity;
     }
 

+ 29 - 20
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportNetReportDTO.java

@@ -1,12 +1,9 @@
 package com.simuwang.base.pojo.dto.report;
 
-import cn.hutool.core.date.DateUtil;
 import com.simuwang.base.pojo.dos.report.ReportNetReportDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
-
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:53
@@ -14,37 +11,50 @@ import java.math.BigDecimal;
  */
 @Setter
 @Getter
-public class ReportNetReportDTO extends BaseReportDTO<ReportNetReportDO> {
-    private String level;
+public class ReportNetReportDTO extends BaseReportLevelDTO<ReportNetReportDO> {
+    /**
+     * 估值日期
+     */
     private String valuationDate;
-
     /**
      * 累计净值
      */
-    private BigDecimal cumulativeNav;
+    private String cumulativeNavWithdrawal;
     /**
      * 基金份额总额
      */
-    private BigDecimal endTotalShares;
+    private String assetShare;
     /**
      * 基金资产净值
      */
-    private BigDecimal fundAssetSize;
+    private String assetNet;
     /**
      * 单位净值
      */
-    private BigDecimal nav;
+    private String nav;
+
+    public ReportNetReportDTO() {
+        super();
+    }
+
+    public ReportNetReportDTO(Integer fileId) {
+        super(fileId);
+    }
+
+    public ReportNetReportDTO(Integer fileId, String level) {
+        super(fileId, level);
+    }
 
     @Override
     public ReportNetReportDO toEntity() {
         ReportNetReportDO entity = new ReportNetReportDO();
         entity.setFileId(this.getFileId());
-        entity.setLevel(this.level);
-        entity.setValuationDate(this.valuationDate == null ? null : DateUtil.parseDate(this.valuationDate));
-        entity.setCumulativeNav(this.cumulativeNav);
-        entity.setEndTotalShares(this.endTotalShares);
-        entity.setFundAssetSize(this.fundAssetSize);
-        entity.setNav(this.nav);
+        entity.setLevel(this.getLevel());
+        entity.setValuationDate(this.toDate(this.valuationDate));
+        entity.setCumulativeNav(this.toBigDecimal(this.cumulativeNavWithdrawal));
+        entity.setEndTotalShares(this.toBigDecimal(this.assetShare));
+        entity.setFundAssetSize(this.toBigDecimal(this.assetNet));
+        entity.setNav(this.toBigDecimal(this.nav));
         return entity;
     }
 
@@ -52,11 +62,10 @@ public class ReportNetReportDTO extends BaseReportDTO<ReportNetReportDO> {
     public String toString() { // renders super.toString() followed by this class's fields as name=value pairs
         return "{" +
                 super.toString() +
-                ", level='" + level + '\'' +
                 ", valuationDate='" + valuationDate + '\'' +
-                ", cumulativeNav=" + cumulativeNav +
-                ", endTotalShares=" + endTotalShares +
-                ", fundAssetSize=" + fundAssetSize +
+                ", cumulativeNavWithdrawal=" + cumulativeNavWithdrawal +
+                ", assetShare=" + assetShare +
+                ", assetNet=" + assetNet + // label uses the field's own name, like the other entries
                 ", nav=" + nav +
                 '}';
     }

+ 30 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParseStatus.java

@@ -0,0 +1,30 @@
+package com.simuwang.base.pojo.dto.report;
+
+import com.smppw.common.pojo.enums.status.StatusCode;
+
+public enum ReportParseStatus implements StatusCode { // status codes (21000-21010) and message templates for report-parsing outcomes
+    PARSE_FAIL(21000, "定期报告解析错误:{}"), // periodic-report parse error; the "{}" is stored as-is in the message
+    NOT_A_REPORT(21001, "不是定期报告"), // the file is not a periodic report
+    REPORT_IS_SCAN(21002, "报告为扫描件"), // the report is a scanned copy
+    NO_SUPPORT_TEMPLATE(21003, "不支持的报告文件格式"), // unsupported report file format
+    NOT_A_FIXED_FORMAT(21004, "不是基协统一格式"), // not the fund association's unified format
+    PARSE_FUND_INFO_FAIL(21010, "没有解析到报告中的基金基本信息"), // the fund's basic information was not parsed from the report
+    ;
+    private final int code; // numeric status code
+    private final String msg; // message text exactly as declared on the constant
+
+    ReportParseStatus(int code, String msg) {
+        this.code = code;
+        this.msg = msg;
+    }
+
+    @Override
+    public int getCode() { // returns the numeric code of this status
+        return this.code;
+    }
+
+    @Override
+    public String getMsg() { // returns the stored message; a "{}" placeholder is not filled in here
+        return this.msg;
+    }
+}

+ 29 - 0
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportParserParams.java

@@ -0,0 +1,29 @@
+package com.simuwang.base.pojo.dto.report;
+
+import lombok.*;
+
+@Getter
+@Builder
+@NoArgsConstructor
+@AllArgsConstructor
+@ToString
+public class ReportParserParams { // parameters describing one report file to parse; no setters are declared here
+    /**
+     * File id.
+     * Join field to the report-parsing table.
+     */
+    private Integer fileId;
+    /**
+     * File name.
+     * The fund registration code is looked for in this name first; if it is not there, it is not taken from the name.
+     */
+    private String filename;
+    /**
+     * File path.
+     */
+    private String filepath;
+    /**
+     * Fund registration (filing) code.
+     */
+    private String registerNumber;
+}

+ 25 - 20
service-base/src/main/java/com/simuwang/base/pojo/dto/report/ReportShareChangeDTO.java

@@ -4,8 +4,6 @@ import com.simuwang.base.pojo.dos.report.ReportShareChangeDO;
 import lombok.Getter;
 import lombok.Setter;
 
-import java.math.BigDecimal;
-
 /**
  * @author wangzaijun
  * @date 2024/9/26 16:40
@@ -13,42 +11,50 @@ import java.math.BigDecimal;
  */
 @Setter
 @Getter
-public class ReportShareChangeDTO extends BaseReportDTO<ReportShareChangeDO> {
-    /**
-     * 基金分级
-     */
-    private String level;
+public class ReportShareChangeDTO extends BaseReportLevelDTO<ReportShareChangeDO> {
     /**
      * 报告期期初基金份额总额
      */
-    private BigDecimal initTotalShares;
+    private String initTotalShares;
     /**
      * 减: 报告期期间基金总赎回份额
      */
-    private BigDecimal redemption;
+    private String redemption;
     /**
      * 期末基金总份额/期末基金实缴总额
      */
-    private BigDecimal sharePerAsset;
+    private String sharePerAsset;
     /**
      * 报告期期间基金拆分变动份额
      */
-    private BigDecimal split;
+    private String splitChangeShare;
     /**
      * 报告期期间基金总申购份额
      */
-    private BigDecimal subscription;
+    private String subscription;
+
+    public ReportShareChangeDTO() { // no-arg constructor; only calls the superclass no-arg constructor
+        super();
+    }
+
+    public ReportShareChangeDTO(Integer fileId) { // forwards fileId to the superclass constructor
+        super(fileId);
+    }
+
+    public ReportShareChangeDTO(Integer fileId, String level) { // forwards fileId and level to the superclass constructor
+        super(fileId, level);
+    }
 
     @Override
     public ReportShareChangeDO toEntity() { // builds a new ReportShareChangeDO from this DTO's values
         ReportShareChangeDO entity = new ReportShareChangeDO();
         entity.setFileId(this.getFileId());
-        entity.setLevel(this.level);
-        entity.setRedemption(this.redemption);
-        entity.setInitTotalShares(this.initTotalShares);
-        entity.setSharePerAsset(this.sharePerAsset);
-        entity.setSplit(this.split);
-        entity.setSubscription(this.subscription);
+        entity.setLevel(this.getLevel()); // level is not a field of this class; getLevel() is presumably inherited from BaseReportLevelDTO
+        entity.setRedemption(this.toBigDecimal(this.redemption)); // toBigDecimal is not defined here (presumably a base-class helper); its null/format handling is not visible - TODO confirm
+        entity.setInitTotalShares(this.toBigDecimal(this.initTotalShares));
+        entity.setSharePerAsset(this.toBigDecimal(this.sharePerAsset));
+        entity.setSplit(this.toBigDecimal(this.splitChangeShare)); // DTO field splitChangeShare feeds the entity's split property
+        entity.setSubscription(this.toBigDecimal(this.subscription));
         return entity;
     }
 
@@ -56,11 +62,10 @@ public class ReportShareChangeDTO extends BaseReportDTO<ReportShareChangeDO> {
     public String toString() { // renders super.toString() followed by this class's fields as name=value pairs
         return "{" +
                 super.toString() +
-                ", level='" + level + '\'' +
                 ", initTotalShares=" + initTotalShares +
                 ", redemption=" + redemption +
                 ", sharePerAsset=" + sharePerAsset +
-                ", split=" + split +
+                ", splitChangeShare=" + splitChangeShare + // label uses the field's own name
                 ", subscription=" + subscription +
                 '}';
     }

+ 10 - 1
service-base/src/main/java/com/simuwang/shiro/core/jwt/JwtContext.java

@@ -6,6 +6,7 @@ import com.github.benmanes.caffeine.cache.Cache;
 import com.github.benmanes.caffeine.cache.Caffeine;
 import com.simuwang.base.config.DaqProperties;
 import io.jsonwebtoken.Claims;
+import io.jsonwebtoken.ExpiredJwtException;
 import io.jsonwebtoken.Jwts;
 import io.jsonwebtoken.security.Keys;
 import org.springframework.stereotype.Component;
@@ -38,7 +39,7 @@ public class JwtContext {
         if (MapUtil.isEmpty(tokenMap)) {
             tokenMap = MapUtil.newConcurrentHashMap(16);
         }
-        tokenMap.putIfAbsent(requestIP, token);
+        tokenMap.put(requestIP, token);
         USER_TOKEN_CACHE.put(username, tokenMap);
     }
 
@@ -75,6 +76,14 @@ public class JwtContext {
      */
     public synchronized String generateToken(String username, String requestIp) {
         String token = this.getUserCache(username, requestIp);
+        if (StrUtil.isNotBlank(token)) {
+            try {
+                // 如果token过期则重新生成
+                this.getClaimsByToken(token);
+            } catch (ExpiredJwtException e) {
+                token = null;
+            }
+        }
         if (StrUtil.isBlank(token)) {
             SecretKey signingKey = Keys.hmacShaKeyFor(this.properties.getTokenSecret().getBytes(StandardCharsets.UTF_8));
             //过期时间

+ 2 - 1
service-base/src/main/resources/mapper/EmailFieldMappingMapper.xml

@@ -5,6 +5,7 @@
         <id column="id" property="id"/>
         <result column="code" property="code"/>
         <result column="name" property="name"/>
+        <result column="type" property="type"/>
         <result column="isvalid" property="isvalid"/>
         <result column="creatorid" property="creatorId"/>
         <result column="createtime" property="createTime"/>
@@ -17,7 +18,7 @@
         from PPW_EMAIL.email_field_mapping
         where isvalid = 1
         <if test="type != null">
-            and(TYPE =#{type} or TYPE = 0)
+            and (TYPE = #{type} or TYPE = 0)
         </if>
     </select>
 

+ 0 - 184
service-daq/src/main/java/com/simuwang/daq/components/AbstractReportParser.java

@@ -1,184 +0,0 @@
-package com.simuwang.daq.components;
-
-import cn.hutool.core.exceptions.ExceptionUtil;
-import cn.hutool.core.map.MapUtil;
-import cn.hutool.core.util.StrUtil;
-import com.simuwang.daq.dto.ReportExtInfo;
-import com.simuwang.daq.dto.ReportFundInfo;
-import com.simuwang.daq.dto.ReportInfo;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.util.StopWatch;
-
-import java.io.IOException;
-import java.util.*;
-import java.util.concurrent.TimeUnit;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public abstract class AbstractReportParser<EXT extends ReportExtInfo> implements ReportParser {
-    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
-    protected String filepath;
-    protected Map<String, List<String>> watermarkListMap;
-
-    @Override
-    public void parse(Integer fileId, String filepath, String watermarkName) {
-        StopWatch watch = new StopWatch();
-        watch.start();
-        if (this.logger.isInfoEnabled()) {
-            this.logger.info("报告{} 开始解析!", filepath);
-        }
-        this.filepath = filepath;
-        this.watermarkListMap = this.generateWatermarkMap(watermarkName);
-        ReportInfo reportInfo = null;
-        ReportFundInfo reportFundInfo = null;
-        List<EXT> exts = null;
-        try {
-            this.initParse();
-            reportInfo = this.parseReportInfo(fileId);
-            reportFundInfo = this.parseBaseInfo();
-            exts = this.parseExtInfo();
-        } catch (Exception e) {
-            this.logger.error("报告{} 解析错误\n{}", filepath, ExceptionUtil.stacktraceToString(e));
-        }
-        this.saveResult(reportInfo, reportFundInfo, exts);
-        watch.stop();
-        if (this.logger.isInfoEnabled()) {
-            this.logger.info("报告{} 解析结束!耗时:{}s", filepath, watch.getTotalTime(TimeUnit.SECONDS));
-        }
-    }
-
-    protected abstract void initParse() throws IOException;
-
-    protected abstract ReportInfo parseReportInfo(Integer fileId);
-
-    protected abstract ReportFundInfo parseBaseInfo();
-
-    protected abstract List<EXT> parseExtInfo();
-
-    protected abstract void saveResult(ReportInfo reportInfo, ReportFundInfo reportFundInfo, List<EXT> exts);
-
-    private Map<String, List<String>> generateWatermarkMap(String watermarkName) {
-        Map<String, List<String>> result = MapUtil.newHashMap(32);
-        // 生成水印列表
-        String text = watermarkName;
-        text = text.replaceAll("[()]", ""); // 移除括号
-        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-        Collections.reverse(textList);
-        StringBuilder sb = new StringBuilder(textList.size());
-        for (String ch : textList) {
-            sb.append(ch);
-        }
-        String joinedText = sb.toString();
-
-        // 基本水印列表
-        List<String> wkList = new ArrayList<>();
-        for (String ch : textList) {
-            wkList.add(ch + "\r\n");
-            wkList.add("\r\n" + ch);
-        }
-
-        // 查找数字
-        List<String> matches = findDigits(watermarkName);
-        if (!matches.isEmpty()) {
-            for (String match : matches) {
-                wkList.add("\r\n" + match);
-                wkList.add(match + "\r\n");
-            }
-        }
-        wkList.add("-");
-        wkList.add("【");
-        wkList.add("】");
-        wkList.add("\r");
-        wkList.add("\n");
-        wkList.add("\r\n");
-
-        String noNumberText = removeDigits(joinedText);
-
-        // 生成不同字段的水印列表
-        result.put("report_name", new ArrayList<>(wkList));
-        result.get("report_name").addAll(convertStringToList("有限公司"));
-
-        result.put("less", new ArrayList<>(wkList));
-
-        result.put("more", new ArrayList<>(wkList));
-        result.get("more").addAll(convertStringToList(noNumberText));
-
-        result.put("leverage", new ArrayList<>(wkList));
-        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-
-        result.put("base_info", new ArrayList<>(wkList));
-        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-
-        result.put("industry", new ArrayList<>(wkList));
-        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-
-        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-        return result;
-    }
-
-    private List<String> findDigits(String text) {
-        List<String> digits = new ArrayList<>();
-        Pattern pattern = Pattern.compile("\\d");
-        Matcher matcher = pattern.matcher(text);
-        while (matcher.find()) {
-            digits.add(matcher.group());
-        }
-        return digits;
-    }
-
-    private String removeDigits(String text) {
-        return text.replaceAll("\\d", "");
-    }
-
-    private String removeKeywords(String text, String... keywords) {
-        for (String keyword : keywords) {
-            text = text.replaceAll(keyword, "");
-        }
-        return text;
-    }
-
-    private List<String> convertStringToList(String text) {
-        List<String> charList = new ArrayList<>();
-        for (char c : text.toCharArray()) {
-            charList.add(c + "");
-        }
-        return charList;
-    }
-
-    protected String processString(List<String> wmList, String string) {
-        if (StrUtil.isBlank(string)) {
-            return null;
-        }
-        // 生成正则表达式模式
-        String pat = String.join("|", wmList);
-        // 使用正则表达式移除wmList中的元素
-        string = removeMatches(string, pat);
-        // 替换中文括号为英文括号
-        string = string.replace("(", "(").replace(")", ")");
-        // 移除空格
-        string = string.replace(" ", "");
-        // 如果字符串以括号开头,则移除第一个字符
-        if (startsWithParenthesis(string)) {
-            string = string.substring(1);
-        }
-
-        return string;
-    }
-
-    private String removeMatches(String input, String pattern) {
-        // 编译正则表达式
-        Pattern compiledPattern = Pattern.compile(pattern);
-        // 创建Matcher对象
-        Matcher matcher = compiledPattern.matcher(input);
-        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-        return matcher.replaceAll("");
-    }
-
-    private boolean startsWithParenthesis(String input) {
-        // 匹配以括号开头的字符串
-        Pattern pattern = Pattern.compile("^[()].*");
-        Matcher matcher = pattern.matcher(input);
-        return matcher.find();
-    }
-}

+ 23 - 19
service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java

@@ -2,41 +2,45 @@ package com.simuwang.daq.components;
 
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.util.StrUtil;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.util.Matrix;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.stream.Collectors;
+
+import static com.simuwang.base.common.conts.Constants.WATERMARK_REPLACE;
 
 /**
  * @author wangzaijun
  * @date 2024/9/12 14:00
  * @description Custom text stripper that removes watermarks; watermarks were observed to be mostly rotated text that is larger than the rest of the report's text
+ * @see CustomTabulaTextStripper the watermark-removal implementation for table text, which differs from this one
  */
 public class CustomPDFTextStripper extends PDFTextStripper {
-    private final float[] watermarkWidth = {0f};
-
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
+        // Watermark text is nearly always rotated; collect the heights of the glyphs whose text-matrix (0,1) entry is non-zero
+        List<Float> heights = textPositions.stream().filter(e -> e.getTextMatrix().getValue(0, 1) != 0.)
+                .map(TextPosition::getHeight).collect(Collectors.toList());
+        // An empty list means no watermark glyph is mixed into this text; write it through unchanged
+        if (CollUtil.isEmpty(heights)) {
+            super.writeString(text);
+            return;
+        }
+        // If every glyph is a watermark glyph, write only the placeholder for the whole chunk
+        if (textPositions.size() == heights.size()) {
+            super.writeString(WATERMARK_REPLACE);
+            return;
+        }
+        // Otherwise filter glyph by glyph: a glyph with no rotation whose height is not among the watermark heights is normal text; anything else is treated as watermark and replaced by the placeholder
         List<String> newTexts = ListUtil.list(false);
         for (TextPosition textPosition : textPositions) {
-            Matrix textMatrix = textPosition.getTextMatrix();
-            float col = textMatrix.getValue(0, 1);
-            float width = textPosition.getWidth();
-            if (col == 0.) {
-                if (width < watermarkWidth[0]) {
-                    newTexts.add(textPosition.getUnicode());
-                }
-            } else {
-                if (width > watermarkWidth[0]) {
-                    watermarkWidth[0] = width;
-                }
-                newTexts.add("++");
-            }
-        }
-        if (CollUtil.isNotEmpty(newTexts)) {
-            super.writeString(String.join("", newTexts));
+            float col = textPosition.getTextMatrix().getValue(0, 1);
+            float height = textPosition.getHeight();
+            newTexts.add(col == 0. && !heights.contains(height) ? textPosition.getUnicode() : WATERMARK_REPLACE);
         }
+        super.writeString(String.join(StrUtil.EMPTY, newTexts));
     }
 }

+ 190 - 0
service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java

@@ -0,0 +1,190 @@
+package com.simuwang.daq.components;
+
+import org.apache.fontbox.util.BoundingBox;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.text.TextPosition;
+import technology.tabula.RectangleSpatialIndex;
+import technology.tabula.TextElement;
+import technology.tabula.TextStripper;
+import technology.tabula.Utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/12 14:00
+ * @description Custom watermark-removing text stripper built on tabula's TextStripper. Watermarks were observed to be mostly rotated text that is larger than the rest of the report's text, so the text rotation and the glyph size are the main criteria for deciding whether a glyph is watermark
+ */
+public class CustomTabulaTextStripper extends TextStripper {
+    private static final String NBSP = "\u00A0"; // non-breaking space; replaced by a plain space in writeString
+    private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f; // blank glyphs at least this many times the running average height are discarded
+    private static final float MAX_BLANK_FONT_SIZE = 40.0f; // blank glyphs with a larger font size (pt) are discarded
+    private static final float MIN_BLANK_FONT_SIZE = 2.0f; // blank glyphs with a smaller font size (pt) are discarded
+    private final PDDocument document;
+    private final ArrayList<TextElement> textElements; // glyphs kept by writeString, in insertion order
+    private final RectangleSpatialIndex<TextElement> spatialIndex; // receives the same glyphs as textElements
+    private float minCharWidth = Float.MAX_VALUE; // smallest glyph width seen so far
+    private float minCharHeight = Float.MAX_VALUE; // smallest glyph height seen so far
+    private float totalHeight = 0.0f; // sum of glyph heights, used for the running average
+    private int countHeight = 0; // number of glyphs counted in totalHeight
+
+    public CustomTabulaTextStripper(PDDocument document, int pageNumber) throws IOException { // restricts extraction to the single page pageNumber (start page == end page)
+        super(document, pageNumber);
+        this.document = document;
+        this.setStartPage(pageNumber);
+        this.setEndPage(pageNumber);
+        this.textElements = new ArrayList<>();
+        this.spatialIndex = new RectangleSpatialIndex<>();
+    }
+
+    public void process() throws IOException { // runs extraction; the returned text is discarded, results are read through the getters below
+        this.getText(this.document);
+    }
+
+    @Override
+    protected void writeString(String string, List<TextPosition> textPositions) {
+        // Glyphs with a rotation component (non-zero text-matrix (0,1) entry)
+        List<TextPosition> rotationTexts = textPositions.stream()
+                .filter(e -> e.getTextMatrix().getValue(0, 1) != 0.).collect(Collectors.toList());
+        // Watermark text is nearly always rotated; collect the heights of the rotated glyphs
+        List<Float> heights = rotationTexts.stream().map(TextPosition::getHeight).collect(Collectors.toList());
+        // If every glyph in this chunk is rotated (watermark only), drop the whole chunk
+        if (textPositions.size() == heights.size()) {
+            return;
+        }
+
+        // Otherwise build a TextElement for each glyph
+        for (TextPosition textPosition : textPositions) {
+            if (textPosition == null) { // NOTE(review): the stream above already dereferences every element, so a null entry would fail before reaching this check
+                continue;
+            }
+
+            String c = textPosition.getUnicode();
+
+            // if c not printable, return
+            if (!isPrintable(c)) {
+                continue;
+            }
+
+            float h = textPosition.getHeightDir();
+
+            if (c.equals(NBSP)) { // replace non-breaking space for space
+                c = " ";
+            }
+
+            // A glyph is normal text only when it has no rotation and its height is not one of the watermark heights; otherwise it is blanked out
+            float rotation = textPosition.getTextMatrix().getValue(0, 1);
+            if (rotation != 0. || heights.contains(h)) { // NOTE(review): heights were collected with getHeight() but h comes from getHeightDir() - confirm both yield the same value
+                c = " ";
+            }
+
+            float wos = textPosition.getWidthOfSpace();
+
+            TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
+                    Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
+                    Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSizeInPt(), c,
+                    // workaround a possible bug in PDFBox:
+                    // https://issues.apache.org/jira/browse/PDFBOX-1755
+                    wos, textPosition.getDir());
+
+            this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
+            this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
+
+            countHeight++; // the running average includes this glyph, even if it is discarded below
+            totalHeight += te.getHeight();
+            float avgHeight = totalHeight / countHeight;
+
+            //We have an issue where tall blank cells throw off the row height calculation
+            //Introspect a blank cell a bit here to see if it should be thrown away
+            if ((te.getText() == null || te.getText().trim().equals(""))) {
+                //if the cell height is more than AVG_HEIGHT_MULT_THRESHOLDxaverage, throw it away
+                if (avgHeight > 0
+                        && te.getHeight() >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD)) {
+                    continue;
+                }
+
+                //if the font size is outside of reasonable ranges, throw it away
+                if (textPosition.getFontSizeInPt() > MAX_BLANK_FONT_SIZE || textPosition.getFontSizeInPt() < MIN_BLANK_FONT_SIZE) {
+                    continue;
+                }
+            }
+
+            this.spatialIndex.add(te);
+            this.textElements.add(te);
+        }
+    }
+
+    @Override
+    protected float computeFontHeight(PDFont font) throws IOException { // derives a glyph height from the font's bounding box, cap height and ascent/descent
+        BoundingBox bbox = font.getBoundingBox();
+        if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
+            // PDFBOX-2158 and PDFBOX-3130
+            // files by Salmat eSolutions / ClibPDF Library
+            bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
+        }
+        // 1/2 the bbox is used as the height todo: why?
+        float glyphHeight = bbox.getHeight() / 2;
+
+        // sometimes the bbox has very high values, but CapHeight is OK
+        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
+        if (fontDescriptor != null) {
+            float capHeight = fontDescriptor.getCapHeight();
+            if (Float.compare(capHeight, 0) != 0 &&
+                    (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
+                glyphHeight = capHeight;
+            }
+            // PDFBOX-3464, PDFBOX-448:
+            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
+            float ascent = fontDescriptor.getAscent();
+            float descent = fontDescriptor.getDescent();
+            if (ascent > 0 && descent < 0 &&
+                    ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
+                glyphHeight = (ascent - descent) / 2;
+            }
+        }
+
+        // transformPoint from glyph space -> text space
+        float height;
+        if (font instanceof PDType3Font) {
+            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
+        } else {
+            height = glyphHeight / 1000;
+        }
+
+        return height;
+    }
+
+    private boolean isPrintable(String s) { // true if at least one char is not an ISO control char and lies in a Unicode block other than SPECIALS
+        char c;
+        Character.UnicodeBlock block;
+        boolean printable = false;
+        for (int i = 0; i < s.length(); i++) {
+            c = s.charAt(i);
+            block = Character.UnicodeBlock.of(c);
+            printable |= !Character.isISOControl(c) && block != null && block != Character.UnicodeBlock.SPECIALS;
+        }
+        return printable;
+    }
+
+    public List<TextElement> getTextElements() { // returns the internal list itself, not a copy
+        return this.textElements;
+    }
+
+    public RectangleSpatialIndex<TextElement> getSpatialIndex() { // returns the index filled by writeString
+        return spatialIndex;
+    }
+
+    public float getMinCharWidth() { // Float.MAX_VALUE until a glyph has been processed
+        return minCharWidth;
+    }
+
+    public float getMinCharHeight() { // Float.MAX_VALUE until a glyph has been processed
+        return minCharHeight;
+    }
+}

+ 0 - 285
service-daq/src/main/java/com/simuwang/daq/components/PDMonthlyReportParser.java

@@ -1,285 +0,0 @@
-package com.simuwang.daq.components;
-
-import cn.hutool.core.collection.CollUtil;
-import cn.hutool.core.collection.ListUtil;
-import cn.hutool.core.map.MapUtil;
-import cn.hutool.core.util.ReflectUtil;
-import cn.hutool.core.util.StrUtil;
-import com.simuwang.base.common.exception.APIException;
-import com.simuwang.base.mapper.EmailFieldMappingMapper;
-import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
-import com.simuwang.daq.dto.MonthlyReportNavInfo;
-import com.simuwang.daq.dto.ReportFundInfo;
-import com.simuwang.daq.dto.ReportInfo;
-import com.smppw.common.pojo.ValueLabelVO;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.springframework.stereotype.Component;
-import technology.tabula.*;
-import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
-
-import java.io.IOException;
-import java.util.Calendar;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-/**
- * @author wangzaijun
- * @date 2024/9/11 16:19
- * @description pdf格式的月报解析
- */
-@Component("monthly-report:pdf")
-public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportNavInfo> {
-    private final List<Table> extNavTables = ListUtil.list(true);
-    private final EmailFieldMappingMapper fieldMappingMapper;
-    private String reportName = null;
-    private Table baseInfoTable = null;
-    private List<ValueLabelVO> fieldMapper = null;
-
-    public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
-        this.fieldMappingMapper = fieldMappingMapper;
-    }
-
-    @Override
-    protected void initParse() throws IOException {
-        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(this.filepath))) {
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
-            stripper.setSortByPosition(true);
-            String text = stripper.getText(document);
-            text = text.replace("++\r\n", "").replace("++", "");
-            List<String> textList = StrUtil.split(text, "\r\n");
-            if (CollUtil.isNotEmpty(textList)) {
-                List<String> wkList = this.watermarkListMap.get("report_name");
-                String name = this.processString(wkList, textList.get(0));
-                this.reportName = this.matchReportName(name);
-                if (StrUtil.isBlank(this.reportName)) {
-                    throw new APIException("未匹配到报告名称");
-                }
-            }
-
-            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-            PageIterator pageIterator = new ObjectExtractor(document).extract();
-            while (pageIterator.hasNext()) {
-                Page page = pageIterator.next();
-                List<Table> tables = extractionAlgorithm.extract(page);
-                tables = tables.stream().distinct().collect(Collectors.toList());
-                for (Table table : tables) {
-                    int colCount = table.getColCount();
-                    if (colCount == 4) {
-                        this.baseInfoTable = table;
-                    } else if (colCount >= 5) {
-                        this.extNavTables.add(table);
-                    }
-                }
-            }
-        }
-        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping(1);
-        if (CollUtil.isNotEmpty(emailFieldMapping)) {
-            this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
-        }
-    }
-
-    @Override
-    protected ReportInfo parseReportInfo(Integer fileId) {
-        ReportInfo reportInfo = new ReportInfo();
-        reportInfo.setFileId(fileId);
-        reportInfo.setReportName(this.reportName);
-        reportInfo.setReportType(this.matchReportType(this.reportName));
-        reportInfo.setReportDate(this.matchReportDate(this.reportName));
-        return reportInfo;
-    }
-
-    @Override
-    protected ReportFundInfo parseBaseInfo() {
-        Table baseInfoTable = this.baseInfoTable;
-        if (baseInfoTable == null) {
-            throw new APIException("未解析到基本信息表格");
-        }
-        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
-        for (int i = 0; i < baseInfoTable.getRows().size(); i++) {
-            List<RectangularTextContainer> cols = baseInfoTable.getRows().get(i);
-            for (int j = 0; j < 2; j++) {
-                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
-            }
-        }
-        // 匹配字段清洗字段
-        ReportFundInfo reportFundInfo = new ReportFundInfo();
-        this.buildInfo(baseInfoMap, reportFundInfo);
-        return reportFundInfo;
-    }
-
-    @Override
-    protected List<MonthlyReportNavInfo> parseExtInfo() {
-        List<MonthlyReportNavInfo> exts = ListUtil.list(false);
-        List<Table> extNavTables = this.extNavTables;
-        for (Table extNavTable : extNavTables) {
-            Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
-            for (int i = 0; i < extNavTable.getColCount(); i++) {
-                String key = extNavTable.getCell(0, i).getText();
-                String value = extNavTable.getCell(1, i).getText();
-                extInfoMap.put(key, value);
-            }
-            MonthlyReportNavInfo navInfo = new MonthlyReportNavInfo();
-            buildInfo(extInfoMap, navInfo);
-            exts.add(navInfo);
-        }
-        return exts;
-    }
-
-    private void buildInfo(Map<String, Object> extInfoMap, Object info) {
-        for (Map.Entry<String, Object> entry : extInfoMap.entrySet()) {
-            String k = entry.getKey();
-            Object v = entry.getValue();
-            String fieldValue = StrUtil.toStringOrNull(v);
-            if (fieldValue.startsWith("-") || fieldValue.endsWith("-")) {
-                fieldValue = null;
-            }
-            if (fieldValue != null) {
-                fieldValue = fieldValue.replace("\r", "");
-            }
-            for (ValueLabelVO vo : this.fieldMapper) {
-                String fieldName = vo.getValue();
-                List<String> labels = StrUtil.split(vo.getLabel(), ",");
-                if (labels.contains(k)) {
-                    try {
-                        ReflectUtil.setFieldValue(info, fieldName, fieldValue);
-                    } catch (Exception e) {
-                        this.logger.warn("{} 字段值设置错误:{}", fieldName, e.getMessage());
-                    }
-                    break;
-                }
-                for (String label : labels) {
-                    if (k.contains(label)) {
-                        try {
-                            ReflectUtil.setFieldValue(info, fieldName, fieldValue);
-                        } catch (Exception e) {
-                            this.logger.warn("{} 字段值设置错误:{}", fieldName, e.getMessage());
-                        }
-                        break;
-                    }
-                }
-            }
-        }
-    }
-
-    @Override
-    protected void saveResult(ReportInfo reportInfo, ReportFundInfo reportFundInfo, List<MonthlyReportNavInfo> exts) {
-        System.out.println("保存数据!");
-    }
-
-    /**
-     * 匹配报告日期
-     *
-     * @param string 文本内容
-     * @return 报告日期
-     */
-    private String matchReportDate(String string) {
-        if (string == null) {
-            return null;
-        }
-
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
-        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
-        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
-        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
-
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(string);
-        Matcher matcher2 = pat2.matcher(string);
-        Matcher matcher3 = pat3.matcher(string);
-        Matcher matcher4 = pat4.matcher(string);
-
-        // 尝试匹配
-        if (matcher1.find()) {
-            String year = matcher1.group(1);
-            String quarter = matcher1.group(2);
-            return switch (quarter) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        } else if (matcher2.find()) {
-            return matcher2.group();
-        } else if (matcher3.find()) {
-            return matcher3.group(1) + "-12-31";
-        } else if (matcher4.find()) {
-            String year = matcher4.group(1);
-            String month = matcher4.group(2);
-            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
-            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * 匹配报告类型,如“季度”、“年度”
-     *
-     * @param string 输入字符串
-     * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
-     */
-    private String matchReportType(String string) {
-        if (string == null) {
-            return null;
-        }
-
-        // 编译正则表达式模式
-        Pattern pattern = Pattern.compile("月|季度|年度");
-
-        // 创建Matcher对象
-        Matcher matcher = pattern.matcher(string);
-
-        // 尝试匹配
-        if (matcher.find()) {
-            return matcher.group();
-        } else {
-            return null;
-        }
-    }
-
-    private String matchReportName(String text) {
-        if (StrUtil.isBlank(text)) {
-            return null;
-        }
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
-        Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
-        Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
-
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(text);
-        Matcher matcher2 = pat2.matcher(text);
-        Matcher matcher3 = pat3.matcher(text);
-
-        // 尝试匹配
-        String reportName;
-        if (matcher1.find()) {
-            reportName = matcher1.group();
-        } else if (matcher2.find()) {
-            reportName = matcher2.group();
-        } else if (matcher3.find()) {
-            reportName = matcher3.group();
-        } else {
-            reportName = text;
-        }
-        return reportName.replace("(", "(").replace(")", ")");
-    }
-
-    private int getLastDayOfMonth(int year, int month) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.set(Calendar.YEAR, year);
-        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
-        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
-    }
-
-    private String padZero(String number) {
-        return String.format("%02d", Integer.parseInt(number));
-    }
-}

+ 10 - 9
service-daq/src/main/java/com/simuwang/daq/components/PythonReportConverter.java

@@ -7,6 +7,7 @@ import cn.hutool.core.util.StrUtil;
 import cn.hutool.json.JSONArray;
 import cn.hutool.json.JSONObject;
 import cn.hutool.json.JSONUtil;
+import com.simuwang.base.common.enums.ReportType;
 import com.simuwang.base.pojo.dos.report.BaseReportDO;
 import com.simuwang.base.pojo.dto.report.*;
 
@@ -21,8 +22,8 @@ import java.util.Set;
  */
 public class PythonReportConverter {
     @SuppressWarnings("unchecked")
-    public static <T extends ReportData> PythonResult<T> convert(JSONObject jsonObject, Integer type) {
-        PythonResult<T> result = new PythonResult<>();
+    public static <T extends ReportData> ParseResult<T> convert(JSONObject jsonObject, ReportType type) {
+        ParseResult<T> result = new ParseResult<>();
         if (jsonObject == null) {
             return result;
         }
@@ -34,7 +35,7 @@ public class PythonReportConverter {
         }
 
         T reportData;
-        if (Objects.equals(2, type) || Objects.equals(1, type)) {
+        if (Objects.equals(ReportType.ANNUALLY, type) || Objects.equals(ReportType.QUARTERLY, type)) {
             reportData = (T) convertQuarterly(data);
         } else {
             reportData = (T) convertMonthly(data);
@@ -44,17 +45,17 @@ public class PythonReportConverter {
     }
 
     private static MonthlyReportData convertMonthly(JSONObject jsonObject) {
-        MonthlyReportData reportData = new MonthlyReportData();
-        reportData.setBaseInfo(convertToObj(jsonObject, "base_info", ReportBaseInfoDTO.class));
-        reportData.setFundInfo(convertToObj(jsonObject, "fund_info", ReportFundInfoDTO.class));
+        ReportBaseInfoDTO baseInfo = convertToObj(jsonObject, "base_info", ReportBaseInfoDTO.class);
+        ReportFundInfoDTO fundInfo = convertToObj(jsonObject, "fund_info", ReportFundInfoDTO.class);
+        MonthlyReportData reportData = new MonthlyReportData(baseInfo, fundInfo);
         reportData.setNetReport(convertToList(jsonObject, "net_report", ReportNetReportDTO.class));
         return reportData;
     }
 
     private static QuarterlyReportData convertQuarterly(JSONObject jsonObject) {
-        QuarterlyReportData reportData = new QuarterlyReportData();
-        reportData.setBaseInfo(convertToObj(jsonObject, "base_info", ReportBaseInfoDTO.class));
-        reportData.setFundInfo(convertToObj(jsonObject, "fund_info", ReportFundInfoDTO.class));
+        ReportBaseInfoDTO baseInfo = convertToObj(jsonObject, "base_info", ReportBaseInfoDTO.class);
+        ReportFundInfoDTO fundInfo = convertToObj(jsonObject, "fund_info", ReportFundInfoDTO.class);
+        QuarterlyReportData reportData = new QuarterlyReportData(baseInfo, fundInfo);
         reportData.setAssetAllocation(convertToList(jsonObject, "asset_allocation", ReportAssetAllocationDTO.class));
         reportData.setFinancialIndicators(convertToList(jsonObject, "financial_indicators", ReportFinancialIndicatorsDTO.class));
         reportData.setInvestmentIndustry(convertToList(jsonObject, "investment_industry", ReportInvestmentIndustryDTO.class));

+ 0 - 18
service-daq/src/main/java/com/simuwang/daq/components/ReportParser.java

@@ -1,18 +0,0 @@
-package com.simuwang.daq.components;
-
-/**
- * @author wangzaijun
- * @date 2024/9/9 19:18
- * @description 报告模板解析器,计划支持pdf、word等
- */
-public interface ReportParser {
-    /**
-     * 报告模板解析接口
-     * 扩展支持月报、季报和年报,解析文件格式支持pdf、word和excel
-     *
-     * @param fileId        文件id
-     * @param filepath      文件路径
-     * @param watermarkName 生成水印
-     */
-    void parse(Integer fileId, String filepath, String watermarkName);
-}

+ 117 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java

@@ -0,0 +1,117 @@
+package com.simuwang.daq.components.report.parser;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.ReflectUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.regex.Pattern;
+
+/**
+ * Base class for report parsers that do not go through the python interface
+ * (the original note mentions pdf, word and excel inputs).
+ * <p>
+ * Holds the label-to-field mapping loaded through {@link EmailFieldMappingMapper}
+ * and the text clean-up helpers used when copying extracted values onto DTOs.
+ *
+ * @author wangzaijun
+ * @date 2024/9/30 18:13
+ */
+public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
+    /**
+     * Full-width semicolon, written as an escape so the literal cannot be
+     * flattened to the ASCII one by re-encoding (which would make the replace a no-op).
+     */
+    private static final String FULL_WIDTH_SEMICOLON = "\uFF1B";
+    /**
+     * A parenthesised fragment using ASCII or full-width parentheses. Compiled once;
+     * the character classes deliberately contain no '|', which would be a literal pipe there.
+     */
+    private static final Pattern PARENTHESES = Pattern.compile("[(\uFF08][^)\uFF09]*[)\uFF09]");
+
+    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    private final EmailFieldMappingMapper fieldMappingMapper;
+    /**
+     * Field matching rules: label text found in the report -> field code to set.
+     */
+    protected Map<String, String> fieldMapper;
+
+    /**
+     * @param fieldMappingMapper mapper used by {@link #init()} to load the matching rules
+     */
+    public AbstractReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        this.fieldMappingMapper = fieldMappingMapper;
+        this.fieldMapper = MapUtil.newHashMap(128);
+    }
+
+    /**
+     * Loads the matching rules into {@link #fieldMapper}. Each mapping row contributes one
+     * entry per comma-separated name; a name that is already present keeps its first code.
+     */
+    protected void init() {
+        // NOTE(review): 3 is presumably the mapping type reserved for report parsing - confirm.
+        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping(3);
+        if (CollUtil.isEmpty(emailFieldMapping)) {
+            this.logger.error("未设置报告解析规则!");
+            return;
+        }
+        for (EmailFieldMappingDO mapping : emailFieldMapping) {
+            String code = mapping.getCode();
+            List<String> names = StrUtil.split(mapping.getName(), ",");
+            for (String name : names) {
+                this.fieldMapper.putIfAbsent(name, code);
+            }
+        }
+    }
+
+    /**
+     * Post-processing hook for the parsed result.
+     *
+     * @param reportData parsed result to clean
+     */
+    protected abstract void cleaningReportData(T reportData);
+
+    /**
+     * Copies values onto {@code info}: every key is cleaned, translated to a field name
+     * through {@link #fieldMapper}, and the cleaned value is written reflectively.
+     * Keys without a mapping are skipped; a failing assignment is logged and skipped.
+     *
+     * @param extInfoMap label -> raw value
+     * @param info       object whose fields are set
+     */
+    protected void buildInfo(Map<String, Object> extInfoMap, Object info) {
+        if (MapUtil.isEmpty(extInfoMap)) {
+            return;
+        }
+        for (Map.Entry<String, Object> entry : extInfoMap.entrySet()) {
+            String k = this.cleaningValue(entry.getKey());
+            String fieldValue = this.cleaningValue(entry.getValue());
+            String fieldName = this.fieldMapper.get(k);
+            if (StrUtil.isBlank(fieldName)) {
+                continue;
+            }
+            try {
+                ReflectUtil.setFieldValue(info, fieldName, fieldValue);
+            } catch (Exception e) {
+                this.logger.warn("{} 字段值设置错误:{}", fieldName, e.getMessage());
+            }
+        }
+    }
+
+    /**
+     * Cleans a value and also strips parenthesised fragments.
+     *
+     * @param value value to clean
+     * @return cleaned text, or null when nothing meaningful remains
+     */
+    protected String cleaningValue(Object value) {
+        return this.cleaningValue(value, true);
+    }
+
+    /**
+     * Converts a value to text and applies simple clean-up: carriage returns and spaces are
+     * removed, full-width semicolons become ASCII ones and, optionally, parenthesised
+     * fragments are dropped.
+     *
+     * @param value              value to clean
+     * @param replaceParentheses whether parenthesised fragments are removed
+     * @return cleaned text; null when the result is blank or is exactly "-"
+     */
+    protected String cleaningValue(Object value, boolean replaceParentheses) {
+        String fieldValue = StrUtil.toStringOrNull(value);
+        if (!StrUtil.isNullOrUndefined(fieldValue)) {
+            // plain (non-regex) replacements are enough for these literals
+            fieldValue = fieldValue
+                    .replace("\r", StrUtil.EMPTY)
+                    .replace(FULL_WIDTH_SEMICOLON, ";")
+                    .replace(" ", StrUtil.EMPTY);
+            if (replaceParentheses) {
+                fieldValue = PARENTHESES.matcher(fieldValue).replaceAll(StrUtil.EMPTY);
+            }
+        }
+        // a lone "-" is a placeholder for "no value"
+        if (Objects.equals("-", fieldValue)) {
+            fieldValue = null;
+        }
+        return StrUtil.isBlank(fieldValue) ? null : fieldValue;
+    }
+}

+ 33 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java

@@ -0,0 +1,33 @@
+package com.simuwang.daq.components.report.parser;
+
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+
+import java.io.IOException;
+
+/**
+ * Contract for report template parsers (the original note plans pdf, word and similar inputs).
+ *
+ * @author wangzaijun
+ * @date 2024/9/9 19:18
+ */
+public interface ReportParser<T extends ReportData> {
+    /**
+     * Name of this parser; unless overridden, the simple class name of the implementation.
+     *
+     * @return parser name
+     */
+    default String getParser() {
+        Class<?> implementation = this.getClass();
+        return implementation.getSimpleName();
+    }
+
+    /**
+     * Parses one report file. The original note lists monthly, quarterly and annual
+     * reports in pdf, word and excel form as the intended scope.
+     *
+     * @param params parse request parameters
+     * @return parse result
+     * @throws IOException          on file I/O failure
+     * @throws ReportParseException when the file cannot be parsed as a report
+     */
+    T parse(ReportParserParams params) throws IOException, ReportParseException;
+}

+ 69 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserConstant.java

@@ -0,0 +1,69 @@
+package com.simuwang.daq.components.report.parser;
+
+import cn.hutool.core.map.MapUtil;
+import com.simuwang.base.common.enums.ReportParserFileType;
+import com.simuwang.base.common.enums.ReportType;
+
+import java.util.Map;
+
+/**
+ * Bean-name constants for the report parsers, plus the lookup table from
+ * report type and file type to bean name.
+ *
+ * @author wangzaijun
+ * @date 2024/9/29 13:39
+ */
+public final class ReportParserConstant {
+    public static final String PARSER_PDF_MONTHLY = "report-parser:pdf:monthly";
+    public static final String PARSER_DOC_MONTHLY = "report-parser:doc:monthly";
+    public static final String PARSER_DOCX_MONTHLY = "report-parser:docx:monthly";
+    public static final String PARSER_XLSX_MONTHLY = "report-parser:xlsx:monthly";
+    public static final String PARSER_XLS_MONTHLY = "report-parser:xls:monthly";
+    public static final String PARSER_PYTHON_MONTHLY = "report-parser:python:monthly";
+
+    public static final String PARSER_PDF_QUARTERLY = "report-parser:pdf:quarterly";
+    public static final String PARSER_DOC_QUARTERLY = "report-parser:doc:quarterly";
+    public static final String PARSER_DOCX_QUARTERLY = "report-parser:docx:quarterly";
+    public static final String PARSER_XLSX_QUARTERLY = "report-parser:xlsx:quarterly";
+    public static final String PARSER_XLS_QUARTERLY = "report-parser:xls:quarterly";
+    public static final String PARSER_PYTHON_QUARTERLY = "report-parser:python:quarterly";
+
+    public static final String PARSER_PDF_ANNUALLY = "report-parser:pdf:annually";
+    public static final String PARSER_DOC_ANNUALLY = "report-parser:doc:annually";
+    public static final String PARSER_DOCX_ANNUALLY = "report-parser:docx:annually";
+    public static final String PARSER_XLSX_ANNUALLY = "report-parser:xlsx:annually";
+    public static final String PARSER_XLS_ANNUALLY = "report-parser:xls:annually";
+    public static final String PARSER_PYTHON_ANNUALLY = "report-parser:python:annually";
+
+    /**
+     * Report type -> (file type -> parser bean name). Published as a read-only view so no
+     * caller can alter the global routing table; the view is backed by a hash map, so a
+     * lookup with a null report type still simply finds nothing.
+     */
+    public static final Map<ReportType, Map<ReportParserFileType, String>> REPORT_PARSER_BEAN_MAP;
+
+    static {
+        Map<ReportType, Map<ReportParserFileType, String>> beans = MapUtil.newHashMap(8);
+        beans.put(ReportType.MONTHLY,
+                Map.of(ReportParserFileType.PDF, PARSER_PDF_MONTHLY,
+                        ReportParserFileType.DOC, PARSER_DOC_MONTHLY,
+                        ReportParserFileType.DOCX, PARSER_DOCX_MONTHLY,
+                        ReportParserFileType.XLSX, PARSER_XLSX_MONTHLY,
+                        ReportParserFileType.XLS, PARSER_XLS_MONTHLY,
+
+                        ReportParserFileType.PYTHON, PARSER_PYTHON_MONTHLY
+                ));
+
+        beans.put(ReportType.QUARTERLY,
+                Map.of(ReportParserFileType.PDF, PARSER_PDF_QUARTERLY,
+                        ReportParserFileType.DOC, PARSER_DOC_QUARTERLY,
+                        ReportParserFileType.DOCX, PARSER_DOCX_QUARTERLY,
+                        ReportParserFileType.XLSX, PARSER_XLSX_QUARTERLY,
+                        ReportParserFileType.XLS, PARSER_XLS_QUARTERLY,
+
+                        ReportParserFileType.PYTHON, PARSER_PYTHON_QUARTERLY
+                ));
+
+        beans.put(ReportType.ANNUALLY,
+                Map.of(ReportParserFileType.PDF, PARSER_PDF_ANNUALLY,
+                        ReportParserFileType.DOC, PARSER_DOC_ANNUALLY,
+                        ReportParserFileType.DOCX, PARSER_DOCX_ANNUALLY,
+                        ReportParserFileType.XLSX, PARSER_XLSX_ANNUALLY,
+                        ReportParserFileType.XLS, PARSER_XLS_ANNUALLY,
+
+                        ReportParserFileType.PYTHON, PARSER_PYTHON_ANNUALLY
+                ));
+        REPORT_PARSER_BEAN_MAP = java.util.Collections.unmodifiableMap(beans);
+    }
+
+    private ReportParserConstant() {
+        // constants holder, never instantiated
+    }
+}

+ 32 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParserFactory.java

@@ -0,0 +1,32 @@
+package com.simuwang.daq.components.report.parser;
+
+import cn.hutool.core.map.MapUtil;
+import com.simuwang.base.common.enums.ReportParserFileType;
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParseStatus;
+import org.springframework.stereotype.Component;
+
+import java.util.Map;
+
+/**
+ * Resolves the {@link ReportParser} bean responsible for a report type / file type pair.
+ */
+@Component
+public class ReportParserFactory {
+    /**
+     * Parser beans by bean name. Kept per instance rather than in a static map, so the
+     * registry belongs to the factory that received it.
+     */
+    private final Map<String, ReportParser<? extends ReportData>> parsers = MapUtil.newHashMap(32);
+
+    /**
+     * @param components parser beans keyed by name, as handed over by the container
+     */
+    public ReportParserFactory(Map<String, ReportParser<? extends ReportData>> components) {
+        if (components != null) {
+            this.parsers.putAll(components);
+        }
+    }
+
+    /**
+     * Looks up the parser registered for the given combination.
+     *
+     * @param reportType           report type
+     * @param reportParserFileType file type
+     * @param <T>                  result type produced by the parser
+     * @return the matching parser, never null
+     * @throws ReportParseException with NO_SUPPORT_TEMPLATE when either argument is null or
+     *                              no parser is registered for the combination
+     */
+    @SuppressWarnings("unchecked")
+    public <T extends ReportData> ReportParser<T> getInstance(ReportType reportType, ReportParserFileType reportParserFileType) {
+        // The inner lookup tables are immutable maps that reject null keys, so nulls are
+        // answered up front with the same "unsupported" error as any other miss.
+        if (reportType == null || reportParserFileType == null) {
+            throw new ReportParseException(ReportParseStatus.NO_SUPPORT_TEMPLATE);
+        }
+        Map<ReportParserFileType, String> beanNames = ReportParserConstant.REPORT_PARSER_BEAN_MAP.get(reportType);
+        String beanName = beanNames == null ? null : beanNames.get(reportParserFileType);
+        ReportParser<? extends ReportData> reportParser = beanName == null ? null : this.parsers.get(beanName);
+        if (reportParser == null) {
+            throw new ReportParseException(ReportParseStatus.NO_SUPPORT_TEMPLATE);
+        }
+        return (ReportParser<T>) reportParser;
+    }
+}

+ 330 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -0,0 +1,330 @@
+package com.simuwang.daq.components.report.parser.pdf;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.exceptions.ExceptionUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.CustomPDFTextStripper;
+import com.simuwang.daq.components.report.parser.AbstractReportParser;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import technology.tabula.CustomObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.PageIterator;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.io.IOException;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 16:45
+ * @description pdf格式的报告解析抽象类
+ */
+public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
+    /**
+     * 基金信息表格
+     */
+    protected Table fundInfoTable;
+    /**
+     * 去除了水印的所有文本内容
+     */
+    protected List<String> textList;
+
+    /**
+     * @param fieldMappingMapper mapper handed to the superclass constructor
+     */
+    public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * Parses a PDF report: extracts the watermark-free text and all tables, lets the
+     * subclass classify the tables, then builds the report info, the fund info and the
+     * remaining data, and finally runs the cleaning hook.
+     *
+     * @param params parse request (file path, file id, file name)
+     * @return the parsed report data
+     * @throws IOException          when the PDF cannot be read
+     * @throws ReportParseException when the file has no text or tables (treated as a scan),
+     *                              is not recognised as a report, or does not follow the
+     *                              expected layout
+     */
+    @Override
+    public T parse(ReportParserParams params) throws IOException, ReportParseException {
+        // NOTE(review): per-parse state lives in instance fields; if parser beans are shared,
+        // concurrent calls would overwrite each other - confirm callers serialise access.
+        this.fundInfoTable = null;
+        this.textList = null;
+        // load the field matching rules
+        this.init();
+        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
+            // all text of the document, with the watermark text removed
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
+            stripper.setSortByPosition(true);
+            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, StrUtil.EMPTY);
+            this.textList = StrUtil.split(text, System.lineSeparator());
+            this.textList.removeIf(StrUtil::isBlank);
+            if (this.textList.isEmpty()) {
+                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
+            }
+            // The report name and type are expected within the first two lines; a document
+            // with a single line must not fail with an index error.
+            boolean looksLikeReport = false;
+            int headerLines = Math.min(2, this.textList.size());
+            for (int i = 0; i < headerLines && !looksLikeReport; i++) {
+                looksLikeReport = this.matchReportType(this.textList.get(i)) != null;
+            }
+            if (!looksLikeReport) {
+                throw new ReportParseException(ReportParseStatus.NOT_A_REPORT);
+            }
+            // all tables of the document; the custom extractor drops watermark text from cells
+            List<Table> tables = ListUtil.list(true);
+            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+            PageIterator pageIterator = new CustomObjectExtractor(document).extract();
+            while (pageIterator.hasNext()) {
+                Page page = pageIterator.next();
+                tables.addAll(extractionAlgorithm.extract(page));
+            }
+            if (tables.isEmpty()) {
+                throw new ReportParseException(ReportParseStatus.REPORT_IS_SCAN);
+            }
+            this.initTableInfo(tables);
+        }
+        try {
+            // report-level information
+            ReportBaseInfoDTO reportInfo = this.buildReportInfo(params);
+            // basic information of the fund the report is about
+            ReportFundInfoDTO reportFundInfo = this.buildFundInfo(params);
+            // remaining tables, assembled into the result object
+            T reportData = this.parseExtInfoAndSetData(reportInfo, reportFundInfo);
+            this.cleaningReportData(reportData);
+            return reportData;
+        } catch (ReportParseException e) {
+            throw e;
+        } catch (Exception e) {
+            // any other failure means the document does not follow the expected layout
+            this.logger.warn("报告解析错误:{}", ExceptionUtil.stacktraceToString(e));
+            throw new ReportParseException(ReportParseStatus.NOT_A_FIXED_FORMAT);
+        }
+    }
+
+    /**
+     * Hook called by parse with every table extracted from the document, so the
+     * implementation can assign them to its own fields according to their layout.
+     *
+     * @param tables all extracted tables, in the order the pages produced them
+     */
+    protected abstract void initTableInfo(List<Table> tables);
+
+    /**
+     * Builds the fund basic-information DTO from {@link #fundInfoTable}
+     * (the original note says the annual-report parser overrides this logic).
+     *
+     * @param params parse request; only its file id is read here
+     * @return the DTO produced by mapping the parsed table
+     * @throws ReportParseException with PARSE_FUND_INFO_FAIL when no fund-info table was captured
+     */
+    protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
+        if (this.fundInfoTable == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FUND_INFO_FAIL);
+        }
+        Integer fileId = params.getFileId();
+        // map the table onto the DTO through the subclass' table reader
+        return this.buildDto(fileId, this.fundInfoTable, ReportFundInfoDTO.class, this::parseFundInfo);
+    }
+
+    /**
+     * Reads the fund basic-information table into a label -> value map; the keys are
+     * later matched against the field mapping when the DTO is filled.
+     *
+     * @param fundInfoTable table to read
+     * @return label -> value pairs found in the table
+     */
+    protected abstract Map<String, Object> parseFundInfo(Table fundInfoTable);
+
+    /**
+     * Parses the remaining information of the report and assembles the result object.
+     *
+     * @param reportInfo report-level information
+     * @param fundInfo   basic information of the fund in the report
+     * @return the assembled report data
+     */
+    protected abstract T parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo);
+
+    /**
+     * Default cleaning step: does nothing. Subclasses may override it.
+     *
+     * @param reportData parsed result
+     */
+    @Override
+    protected void cleaningReportData(T reportData) {
+        // intentionally empty
+    }
+
+    /**
+     * Creates the report-level information. The file name serves as the report name,
+     * and both the report type and the report date are derived from it.
+     *
+     * @param params parse request (file id and file name are read)
+     * @return the populated report info
+     */
+    private ReportBaseInfoDTO buildReportInfo(ReportParserParams params) {
+        Integer fileId = params.getFileId();
+        String filename = params.getFilename();
+        String reportType = this.matchReportType(filename);
+        String reportDate = this.matchReportDate(filename);
+        ReportBaseInfoDTO baseInfo = new ReportBaseInfoDTO(fileId);
+        baseInfo.setReportName(filename);
+        baseInfo.setReportType(reportType);
+        baseInfo.setReportDate(reportDate);
+        return baseInfo;
+    }
+
+    /**
+     * Builds one DTO per table and labels them with fund levels: the first table gets
+     * "母基金", the following ones the tier names found in the document text, in order.
+     * Tables beyond the number of known levels keep their level unset.
+     *
+     * @param <DTO>    DTO type
+     * @param fileId   file id
+     * @param tables   tables to convert; null entries are ignored
+     * @param clazz    DTO class
+     * @param function converts a table into label -> value pairs
+     * @return converted DTOs; an entry is null when its table could not be converted
+     */
+    protected <DTO extends BaseReportLevelDTO<?>> List<DTO> buildLevelDto(Integer fileId, List<Table> tables, Class<DTO> clazz,
+                                                                          Function<Table, Map<String, Object>> function) {
+        // table -> DTO
+        List<DTO> dtos = tables.stream().filter(Objects::nonNull)
+                .map(e -> this.buildDto(fileId, e, clazz, function)).collect(Collectors.toList());
+        // level names, parent fund first
+        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
+        levels.add(0, "母基金");
+        int labelled = Math.min(dtos.size(), levels.size());
+        for (int i = 0; i < labelled; i++) {
+            DTO dto = dtos.get(i);
+            // a failed conversion leaves a null entry; skip it instead of failing the whole report
+            if (dto != null) {
+                dto.setLevel(levels.get(i));
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Builds a DTO from a table: the table is converted into label -> value pairs, a new
+     * DTO is created through its no-argument constructor, and the pairs are applied to it.
+     *
+     * @param <DTO>    DTO type
+     * @param fileId   file id
+     * @param table    table to convert
+     * @param clazz    DTO class
+     * @param function converts the table into label -> value pairs; null means no pairs
+     * @return the populated DTO, or null when the conversion failed (the failure is logged)
+     */
+    private <DTO extends BaseReportDTO<?>> DTO buildDto(Integer fileId, Table table, Class<DTO> clazz,
+                                                        Function<Table, Map<String, Object>> function) {
+        try {
+            Map<String, Object> extInfoMap = function == null ? MapUtil.empty() : function.apply(table);
+            DTO dto = clazz.getDeclaredConstructor().newInstance();
+            dto.setFileId(fileId);
+            this.buildInfo(extInfoMap, dto);
+            return dto;
+        } catch (Exception e) {
+            // best effort stays best effort, but the cause must not disappear silently
+            this.logger.warn("表格转换为{}对象失败", clazz, e);
+        }
+        return null;
+    }
+
+    /**
+     * Finds the tier names mentioned in the text. Every hit is reduced to its letter
+     * (A-F); the distinct letters are sorted and returned with the "级" suffix.
+     *
+     * @param text text to scan
+     * @return tier names in alphabetical order; empty when the text is blank or has no hit
+     */
+    protected List<String> matchTieredFund(String text) {
+        if (StrUtil.isBlank(text)) {
+            return ListUtil.list(false);
+        }
+        Matcher tierMatcher = Pattern.compile("[A-F]级|基金[A-F]").matcher(text);
+        return tierMatcher.results()
+                .map(hit -> hit.group().replaceAll("[^A-F]", ""))
+                .distinct()
+                .sorted()
+                .map(letter -> letter + "级")
+                .collect(Collectors.toList());
+    }
+
+    /** e.g. 2023年XXX3季度 */
+    private static final Pattern DATE_BY_QUARTER = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");
+    /** e.g. 2023-12-31 */
+    private static final Pattern DATE_DASHED = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");
+    /** e.g. 20231231 */
+    private static final Pattern DATE_COMPACT = Pattern.compile("(\\d{4})(\\d{2})(\\d{2})");
+    /** e.g. 2023年年度 */
+    private static final Pattern DATE_BY_ANNUAL = Pattern.compile("(2\\d{3})年年度");
+    /** e.g. 2023年度 */
+    private static final Pattern DATE_BY_YEAR = Pattern.compile("(2\\d{3})年度");
+    /** e.g. 2023年12月 */
+    private static final Pattern DATE_BY_MONTH = Pattern.compile("(\\d{4})年(\\d{1,2})月");
+
+    /**
+     * Derives the report date from a text, trying in order: quarter, dashed date, compact
+     * date, annual wording, year wording, year-month. Period wordings resolve to the last
+     * day of the period. Every branch returns the yyyy-MM-dd form, including the compact
+     * one, so callers receive one format.
+     *
+     * @param string text to scan
+     * @return report date as yyyy-MM-dd, or null when nothing matches
+     */
+    private String matchReportDate(String string) {
+        if (string == null) {
+            return null;
+        }
+        Matcher quarterly = DATE_BY_QUARTER.matcher(string);
+        if (quarterly.find()) {
+            String year = quarterly.group(1);
+            String quarter = quarterly.group(2);
+            return switch (quarter) {
+                case "一", "1" -> year + "-03-31";
+                case "二", "2" -> year + "-06-30";
+                case "三", "3" -> year + "-09-30";
+                case "四", "4" -> year + "-12-31";
+                default -> null;
+            };
+        }
+        Matcher dashed = DATE_DASHED.matcher(string);
+        if (dashed.find()) {
+            return dashed.group();
+        }
+        Matcher compact = DATE_COMPACT.matcher(string);
+        if (compact.find()) {
+            // normalise 20231231 to 2023-12-31 like the other branches
+            return compact.group(1) + "-" + compact.group(2) + "-" + compact.group(3);
+        }
+        Matcher annual = DATE_BY_ANNUAL.matcher(string);
+        if (annual.find()) {
+            return annual.group(1) + "-12-31";
+        }
+        Matcher yearly = DATE_BY_YEAR.matcher(string);
+        if (yearly.find()) {
+            return yearly.group(1) + "-12-31";
+        }
+        Matcher monthly = DATE_BY_MONTH.matcher(string);
+        if (monthly.find()) {
+            String year = monthly.group(1);
+            String month = monthly.group(2);
+            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
+            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
+        }
+        return null;
+    }
+
+    /**
+     * Finds the report type wording in a text, using the combined pattern supplied by
+     * {@code ReportType.getAllPatterns()}.
+     *
+     * @param string text to scan
+     * @return the first matching fragment, or null when the text is null or has no match
+     */
+    private String matchReportType(String string) {
+        if (string == null) {
+            return null;
+        }
+        Matcher typeMatcher = Pattern.compile(ReportType.getAllPatterns()).matcher(string);
+        return typeMatcher.find() ? typeMatcher.group() : null;
+    }
+
+    /**
+     * Number of days in the given month.
+     *
+     * @param year  calendar year
+     * @param month month of the year, 1-12
+     * @return last day of that month
+     */
+    private int getLastDayOfMonth(int year, int month) {
+        Calendar calendar = Calendar.getInstance();
+        // Start from a cleared calendar pinned to day 1: keeping today's day-of-month would
+        // roll a shorter target month over into the next one when run on the 29th-31st.
+        calendar.clear();
+        calendar.set(year, month - 1, 1); // Calendar.MONTH is zero-based
+        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
+    }
+
+    /**
+     * Formats a numeric string with at least two digits, left-padded with zero.
+     *
+     * @param number decimal integer text
+     * @return the number with a leading zero when it has a single digit
+     */
+    private String padZero(String number) {
+        int parsed = Integer.parseInt(number);
+        return String.format("%02d", parsed);
+    }
+}

+ 156 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -0,0 +1,156 @@
+package com.simuwang.daq.components.report.parser.pdf;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+import technology.tabula.Table;
+
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * @author wangzaijun
+ * @date 2024/10/10 17:34
+ * @description 年报解析逻辑:基本信息被拆分为多个表格,财务报表未解析
+ */
+@Component(ReportParserConstant.PARSER_PDF_ANNUALLY)
+public class PDAnnuallyReportParser extends PDQuarterlyReportParser<AnnuallyReportData> {
+    private static final List<String> FINANCIAL_INDICATORS_COLUMN_NAMES = ListUtil.list(false);
+
+    static {
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末基金净资产");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("报告期期末单位净值");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("本期已实现收益");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("期末可供分配基金份额利润");
+        FINANCIAL_INDICATORS_COLUMN_NAMES.add("基金份额累计净值增长率");
+    }
+
+    private List<Table> fundInfoTables;
+
+    /**
+     * @param fieldMappingMapper mapper handed to the superclass constructor
+     */
+    public PDAnnuallyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * @return the bean-name constant of this parser, {@code PARSER_PDF_ANNUALLY}
+     */
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PDF_ANNUALLY;
+    }
+
+    /**
+     * Sorts the extracted tables into the per-kind lists. The first two tables are taken as
+     * fund information; every later table is classified by its column texts and column
+     * count. Tables matching none of the rules are dropped.
+     *
+     * @param tables all tables extracted from the document
+     */
+    @Override
+    protected void initTableInfo(List<Table> tables) {
+        // start every parse with fresh lists
+        this.fundInfoTables = ListUtil.list(true);
+        this.financialIndicatorsTables = ListUtil.list(true);
+        this.shareChangeTables = ListUtil.list(true);
+        this.assetAllocationTables = ListUtil.list(true);
+        this.investmentIndustryTables = ListUtil.list(true);
+        for (int i = 0; i < tables.size(); i++) {
+            Table table = tables.get(i);
+            if (i <= 1) {
+                this.fundInfoTables.add(table);
+                continue;
+            }
+            // the first column decides whether this is a main-financial-indicators table
+            List<String> texts = this.getTableColTexts(table, 0);
+            if (CollUtil.containsAny(texts, FINANCIAL_INDICATORS_COLUMN_NAMES)) {
+                this.financialIndicatorsTables.add(table);
+                continue;
+            }
+            int colCount = table.getColCount();
+            if (colCount == 2) {
+                // the first column decides whether this is a share-change table
+                if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
+                    this.shareChangeTables.add(table);
+                }
+            } else if (colCount == 4) {
+                // the second column decides whether this is an industry allocation table (mainland)
+                texts = this.getTableColTexts(table, 1);
+                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                    this.investmentIndustryTables.add(table);
+                }
+            } else if (colCount == 3) {
+                // the first column decides whether this is an industry allocation table (Hong Kong Stock Connect)
+                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                    this.investmentIndustryTables.add(table);
+                    continue;
+                }
+                // asset allocation table (tolerating tables split across pages): recognised
+                // when the second column contains one of the asset category names below
+                texts = this.getTableColTexts(table, 1);
+                if (CollUtil.containsAny(texts, ListUtil.of("股权投资", "股票投资", "债券投资", "另类投资", "其他资产", "其他融资总额"))) {
+                    this.assetAllocationTables.add(table);
+                }
+            }
+        }
+    }
+
+    /**
+     * Builds the fund information from all fund-info tables: their label -> value pairs
+     * are merged (later tables overwrite equal labels) and applied to a single DTO.
+     *
+     * @param params parse request; only its file id is read here
+     * @return the populated fund information
+     */
+    @Override
+    protected ReportFundInfoDTO buildFundInfo(ReportParserParams params) {
+        Map<String, Object> merged = MapUtil.newHashMap(32);
+        this.fundInfoTables.forEach(table -> merged.putAll(this.parseFundInfo(table)));
+        ReportFundInfoDTO fundInfo = new ReportFundInfoDTO(params.getFileId());
+        this.buildInfo(merged, fundInfo);
+        return fundInfo;
+    }
+
+    /**
+     * Assembles the annual report result from the parts already parsed plus the financial
+     * indicators, which are built here. The {@code function} argument is not used by this
+     * override.
+     *
+     * @param reportInfo           report-level information
+     * @param fundInfo             fund basic information
+     * @param shareChanges         share change records
+     * @param assetAllocations     asset allocation records
+     * @param investmentIndustries industry allocation records
+     * @param function             table converter (unused here)
+     * @return the assembled annual report data
+     */
+    @Override
+    protected AnnuallyReportData buildExtData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
+                                              List<ReportShareChangeDTO> shareChanges,
+                                              List<ReportAssetAllocationDTO> assetAllocations,
+                                              List<ReportInvestmentIndustryDTO> investmentIndustries,
+                                              Function<Table, Map<String, Object>> function) {
+        // financial indicators first, then the result object
+        Integer fileId = reportInfo.getFileId();
+        List<ReportFinancialIndicatorsDTO> indicators = this.buildFinancialIndicatorsInfo(fileId);
+        AnnuallyReportData annual = new AnnuallyReportData(reportInfo, fundInfo);
+        annual.setShareChange(shareChanges);
+        annual.setFinancialIndicators(indicators);
+        annual.setAssetAllocation(assetAllocations);
+        annual.setInvestmentIndustry(investmentIndustries);
+        return annual;
+    }
+
+    /**
+     * Cleaning step for annual report data; currently does nothing.
+     *
+     * @param reportData parsed annual report
+     */
+    @Override
+    protected void cleaningReportData(AnnuallyReportData reportData) {
+        // todo: data cleaning
+    }
+
+    /**
+     * Builds the financial indicator records. Each indicator table yields one record per
+     * value column (columns 1..n); the column header in row 0 is stored under "年度" and
+     * every row whose first cell is a known indicator name contributes one value.
+     * The k-th table is labelled with the k-th fund level when such a level exists;
+     * otherwise the level is left unset.
+     *
+     * @param fileId file id
+     * @return indicator records of all tables
+     */
+    private List<ReportFinancialIndicatorsDTO> buildFinancialIndicatorsInfo(Integer fileId) {
+        List<ReportFinancialIndicatorsDTO> dtos = ListUtil.list(false);
+        // fund levels, parent fund first
+        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
+        levels.add(0, "母基金");
+        // The original assumes one table per level and no table split across pages; when
+        // there are more tables than levels, the extra ones must not break the parse.
+        for (int k = 0; k < this.financialIndicatorsTables.size(); k++) {
+            Table table = this.financialIndicatorsTables.get(k);
+            String level = k < levels.size() ? levels.get(k) : null;
+            int colCount = table.getColCount();
+            for (int j = 1; j < colCount; j++) {
+                Map<String, Object> infoMap = MapUtil.newHashMap(16);
+                String year = this.cleaningValue(table.getCell(0, j).getText());
+                infoMap.put("年度", year);
+                for (int i = 0; i < table.getRowCount(); i++) {
+                    String columnName = this.cleaningValue(table.getCell(i, 0).getText());
+                    if (!CollUtil.contains(FINANCIAL_INDICATORS_COLUMN_NAMES, columnName)) {
+                        continue;
+                    }
+                    String value = this.cleaningValue(table.getCell(i, j).getText());
+                    infoMap.put(columnName, value);
+                }
+                ReportFinancialIndicatorsDTO dto = new ReportFinancialIndicatorsDTO(fileId);
+                this.buildInfo(infoMap, dto);
+                if (level != null) {
+                    dto.setLevel(level);
+                }
+                dtos.add(dto);
+            }
+        }
+        return dtos;
+    }
+}

+ 89 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java

@@ -0,0 +1,89 @@
+package com.simuwang.daq.components.report.parser.pdf;
+
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.MonthlyReportData;
+import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
+import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
+import com.simuwang.base.pojo.dto.report.ReportNetReportDTO;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/11 16:19
+ * @description Parser for monthly reports in PDF format
+ */
+@Component(ReportParserConstant.PARSER_PDF_MONTHLY)
+public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportData> {
+    /**
+     * Net-value tables (5 or more columns) of the report currently being parsed.
+     */
+    private final List<Table> extNavTables = ListUtil.list(true);
+
+    public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PDF_MONTHLY;
+    }
+
+    /**
+     * Classifies the extracted tables by column count: a 4-column table is the fund basic
+     * information, tables with 5 or more columns are monthly net-value tables.
+     *
+     * @param tables all tables extracted from the PDF
+     */
+    @Override
+    protected void initTableInfo(List<Table> tables) {
+        // This component has no explicit scope, so the same instance serves every report.
+        // Drop the tables collected for the previous report, otherwise they would be parsed again
+        // (the quarterly parser re-creates its table lists here for the same reason).
+        this.extNavTables.clear();
+        // Monthly reports normally follow a fixed template
+        for (Table table : tables) {
+            int colCount = table.getColCount();
+            int rowCount = table.getRowCount();
+            if (colCount == 0 && rowCount == 0) {
+                continue;
+            }
+            if (colCount == 4) {
+                this.fundInfoTable = table;
+            } else if (colCount >= 5) {
+                this.extNavTables.add(table);
+            }
+        }
+    }
+
+    /**
+     * Reads the fund basic information from the 4-column table. Every row carries two
+     * key/value pairs: cells 0 and 1, and cells 2 and 3.
+     *
+     * @param fundInfoTable the 4-column fund information table
+     * @return cell text of the key cells mapped to the cell text of the value cells
+     */
+    @Override
+    protected Map<String, Object> parseFundInfo(Table fundInfoTable) {
+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+        for (int i = 0; i < fundInfoTable.getRows().size(); i++) {
+            @SuppressWarnings("all")
+            List<RectangularTextContainer> cols = fundInfoTable.getRows().get(i);
+            for (int j = 0; j < 2; j++) {
+                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
+            }
+        }
+        return baseInfoMap;
+    }
+
+    /**
+     * Builds the monthly report data: base info, fund info and the net values of the parent
+     * fund and the tiered funds taken from the net-value tables.
+     *
+     * @param reportInfo report base information
+     * @param fundInfo   fund information
+     * @return the assembled monthly report data
+     */
+    @Override
+    protected MonthlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo) {
+        MonthlyReportData reportData = new MonthlyReportData(reportInfo, fundInfo);
+        // Row 0 of a net-value table holds the field names, row 1 the matching values
+        List<ReportNetReportDTO> dtos = this.buildLevelDto(reportInfo.getFileId(), this.extNavTables,
+                ReportNetReportDTO.class, t -> {
+                    Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
+                    for (int i = 0; i < t.getColCount(); i++) {
+                        String key = t.getCell(0, i).getText();
+                        String value = t.getCell(1, i).getText();
+                        extInfoMap.put(key, value);
+                    }
+                    return extInfoMap;
+                });
+        reportData.setNetReport(dtos);
+        return reportData;
+    }
+
+    @Override
+    protected void cleaningReportData(MonthlyReportData reportData) {
+        // TODO: data cleaning is not implemented yet
+    }
+}

+ 296 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -0,0 +1,296 @@
+package com.simuwang.daq.components.report.parser.pdf;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.*;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
+
+import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 17:53
+ * @description Parsing logic for quarterly reports in PDF format
+ */
+@Component(ReportParserConstant.PARSER_PDF_QUARTERLY)
+public class PDQuarterlyReportParser<T extends QuarterlyReportData> extends AbstractPDReportParser<T> {
+    /**
+     * First-column texts that identify a 3-column table as an industry-allocation table.
+     */
+    protected static final List<String> INDUSTRY_COLUMN_NAMES = ListUtil.list(false);
+    /**
+     * First-column texts that identify a 2-column table as a share-change table.
+     */
+    protected static final List<String> SHARE_CHANGE_COLUMN_NAMES = ListUtil.list(false);
+
+    static {
+        // CSRC (China Securities Regulatory Commission) industry classification
+        INDUSTRY_COLUMN_NAMES.add("农、林、牧、渔业");
+        INDUSTRY_COLUMN_NAMES.add("采矿业");
+        INDUSTRY_COLUMN_NAMES.add("制造业");
+        INDUSTRY_COLUMN_NAMES.add("电力、热力、燃气及水生产和供应业");
+        INDUSTRY_COLUMN_NAMES.add("建筑业");
+        INDUSTRY_COLUMN_NAMES.add("批发和零售业");
+        INDUSTRY_COLUMN_NAMES.add("交通运输、仓储和邮政业");
+        INDUSTRY_COLUMN_NAMES.add("住宿和餐饮业");
+        INDUSTRY_COLUMN_NAMES.add("信息传输、软件和信息技术服务业");
+        INDUSTRY_COLUMN_NAMES.add("金融业");
+        INDUSTRY_COLUMN_NAMES.add("房地产业");
+        INDUSTRY_COLUMN_NAMES.add("租赁和商务服务业");
+        INDUSTRY_COLUMN_NAMES.add("科学研究和技术服务业");
+        INDUSTRY_COLUMN_NAMES.add("水利、环境和公共设施管理业");
+        INDUSTRY_COLUMN_NAMES.add("居民服务、修理和其他服务业");
+        INDUSTRY_COLUMN_NAMES.add("教育");
+        INDUSTRY_COLUMN_NAMES.add("卫生和社会工作");
+        INDUSTRY_COLUMN_NAMES.add("文化、体育和娱乐业");
+        INDUSTRY_COLUMN_NAMES.add("综合");
+
+        INDUSTRY_COLUMN_NAMES.add("港股通");
+
+        // International classification below
+        INDUSTRY_COLUMN_NAMES.add("能源");
+        INDUSTRY_COLUMN_NAMES.add("原材料");
+        INDUSTRY_COLUMN_NAMES.add("工业");
+        INDUSTRY_COLUMN_NAMES.add("非日常生活消费品");
+        INDUSTRY_COLUMN_NAMES.add("日常消费品");
+        INDUSTRY_COLUMN_NAMES.add("医疗保健");
+        INDUSTRY_COLUMN_NAMES.add("金融");
+        INDUSTRY_COLUMN_NAMES.add("信息技术");
+        INDUSTRY_COLUMN_NAMES.add("通讯服务");
+        INDUSTRY_COLUMN_NAMES.add("公用事业");
+        INDUSTRY_COLUMN_NAMES.add("房地产");
+
+        // Row names used to recognise a share-change table
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期初基金份额总额");
+        SHARE_CHANGE_COLUMN_NAMES.add("减:报告期期间基金总赎回份额");
+        SHARE_CHANGE_COLUMN_NAMES.add("期末基金总份额/期末基金实缴总额");
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金拆分变动份额");
+        SHARE_CHANGE_COLUMN_NAMES.add("报告期期间基金总申购份额");
+    }
+
+    protected List<Table> financialIndicatorsTables;
+    protected List<Table> shareChangeTables;
+    protected List<Table> assetAllocationTables;
+    protected List<Table> investmentIndustryTables;
+
+    public PDQuarterlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PDF_QUARTERLY;
+    }
+
+    /**
+     * Sorts the extracted tables into the four table lists and the fund information table.
+     * The lists are re-created on every call, so nothing from a previous report is kept.
+     * <ul>
+     *   <li>13 rows x 2 columns: fund basic information</li>
+     *   <li>other 2-column tables: share change when the first column contains one of
+     *       {@link #SHARE_CHANGE_COLUMN_NAMES}, otherwise main financial indicators</li>
+     *   <li>4 columns: industry allocation</li>
+     *   <li>3 columns: industry allocation when the first column contains one of
+     *       {@link #INDUSTRY_COLUMN_NAMES}, otherwise asset allocation</li>
+     * </ul>
+     *
+     * @param tables all tables extracted from the PDF
+     */
+    @Override
+    protected void initTableInfo(List<Table> tables) {
+        this.financialIndicatorsTables = ListUtil.list(true);
+        this.shareChangeTables = ListUtil.list(true);
+        this.assetAllocationTables = ListUtil.list(true);
+        this.investmentIndustryTables = ListUtil.list(true);
+        for (Table table : tables) {
+            int colCount = table.getColCount();
+            int rowCount = table.getRowCount();
+            if (colCount == 0 && rowCount == 0) {
+                continue;
+            }
+            if (rowCount == 13 && colCount == 2) {
+                this.fundInfoTable = table;
+            } else if (colCount == 2) {
+                // The first column decides: share change or main financial indicators
+                List<String> texts = this.getTableColTexts(table, 0);
+                if (CollUtil.containsAny(texts, SHARE_CHANGE_COLUMN_NAMES)) {
+                    this.shareChangeTables.add(table);
+                } else {
+                    this.financialIndicatorsTables.add(table);
+                }
+            } else if (colCount == 4) {
+                // Industry allocation
+                this.investmentIndustryTables.add(table);
+            } else if (colCount == 3) {
+                // The first column decides: industry allocation or asset allocation
+                List<String> texts = this.getTableColTexts(table, 0);
+                if (CollUtil.containsAny(texts, INDUSTRY_COLUMN_NAMES)) {
+                    this.investmentIndustryTables.add(table);
+                } else {
+                    this.assetAllocationTables.add(table);
+                }
+            }
+        }
+    }
+
+    /**
+     * Reads the fund basic information of quarterly and annual reports, which is a
+     * two-column table: cell 0 of each row is the key, cell 1 the value.
+     *
+     * @param fundInfoTable the two-column fund information table
+     * @return key cell text mapped to value cell text
+     */
+    @Override
+    protected Map<String, Object> parseFundInfo(Table fundInfoTable) {
+        Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
+        for (@SuppressWarnings("all") List<RectangularTextContainer> cols : fundInfoTable.getRows()) {
+            baseInfoMap.put(cols.get(0).getText(), cols.get(1).getText());
+        }
+        return baseInfoMap;
+    }
+
+    /**
+     * Parses share changes, asset allocation and industry allocation, then hands them to
+     * {@link #buildExtData} which adds the financial indicators and creates the result object.
+     *
+     * @param reportInfo report base information
+     * @param fundInfo   fund information
+     * @return the assembled report data
+     */
+    @Override
+    protected T parseExtInfoAndSetData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo) {
+        Integer fileId = reportInfo.getFileId();
+        // Converts a two-column table into a map: column 0 is the key, column 1 the value
+        Function<Table, Map<String, Object>> function = t -> {
+            Map<String, Object> extInfoMap = MapUtil.newHashMap(16);
+            for (int i = 0; i < t.getRowCount(); i++) {
+                String key = t.getCell(i, 0).getText();
+                String value = t.getCell(i, 1).getText();
+                extInfoMap.put(key, value);
+            }
+            return extInfoMap;
+        };
+        // Share changes
+        List<ReportShareChangeDTO> shareChanges = this.buildLevelDto(fileId, this.shareChangeTables,
+                ReportShareChangeDTO.class, function);
+        // Asset allocation
+        List<ReportAssetAllocationDTO> assetAllocations = this.buildAssetAllocationInfo(fileId);
+        // Industry allocation
+        List<ReportInvestmentIndustryDTO> investmentIndustries = this.buildInvestmentIndustryInfo(fileId);
+        // Financial indicators and the result object are built in buildExtData only;
+        // building them here as well was redundant because that result was never returned.
+        return this.buildExtData(reportInfo, fundInfo, shareChanges, assetAllocations, investmentIndustries, function);
+    }
+
+    /**
+     * Creates the report data object from the already parsed parts and the main financial
+     * indicators, which are parsed here from {@code financialIndicatorsTables}.
+     * NOTE(review): protected, presumably so that the annual parser can supply its own
+     * financial-indicator handling - confirm against the subclasses.
+     *
+     * @param reportInfo           report base information
+     * @param fundInfo             fund information
+     * @param shareChanges         parsed share changes
+     * @param assetAllocations     parsed asset allocation
+     * @param investmentIndustries parsed industry allocation
+     * @param function             converts a table into a field-name/value map
+     * @return the assembled report data
+     */
+    protected T buildExtData(ReportBaseInfoDTO reportInfo, ReportFundInfoDTO fundInfo,
+                             List<ReportShareChangeDTO> shareChanges,
+                             List<ReportAssetAllocationDTO> assetAllocations,
+                             List<ReportInvestmentIndustryDTO> investmentIndustries,
+                             Function<Table, Map<String, Object>> function) {
+        Integer fileId = reportInfo.getFileId();
+        // Main financial indicators
+        List<ReportFinancialIndicatorsDTO> financialIndicators = this.buildLevelDto(fileId, this.financialIndicatorsTables,
+                ReportFinancialIndicatorsDTO.class, function);
+        QuarterlyReportData reportData = new QuarterlyReportData(reportInfo, fundInfo);
+        reportData.setShareChange(shareChanges);
+        reportData.setFinancialIndicators(financialIndicators);
+        reportData.setAssetAllocation(assetAllocations);
+        reportData.setInvestmentIndustry(investmentIndustries);
+        @SuppressWarnings("unchecked")
+        T t = (T) reportData;
+        return t;
+    }
+
+    @Override
+    protected void cleaningReportData(T reportData) {
+        // TODO: data cleaning is not implemented yet
+    }
+
+    /**
+     * Builds the industry-allocation records from {@code investmentIndustryTables}.
+     * 4-column tables carry a leading sequence-number column and get invest type 1,
+     * all other tables start directly with the industry name and get invest type 2.
+     *
+     * @param fileId id of the report file
+     * @return one record per table row that is not a header row
+     */
+    private List<ReportInvestmentIndustryDTO> buildInvestmentIndustryInfo(Integer fileId) {
+        List<ReportInvestmentIndustryDTO> dtos = ListUtil.list(false);
+        for (Table table : this.investmentIndustryTables) {
+            int colCount = table.getColCount();
+            // Investment region: 1 - domestic, 2 - Hong Kong Stock Connect
+            int investType = colCount == 4 ? 1 : 2;
+            // Index of the industry-name column
+            int j = colCount == 4 ? 1 : 0;
+            // Row by row
+            for (int i = 0; i < table.getRowCount(); i++) {
+                String text = this.cleaningValue(table.getCell(i, 0).getText());
+                // Skip header rows
+                if (StrUtil.containsAny(text, "序号", "行业类别")) {
+                    continue;
+                }
+                ReportInvestmentIndustryDTO dto = new ReportInvestmentIndustryDTO(fileId);
+                dto.setInvestType(investType);
+                dto.setIndustryName(this.cleaningValue(table.getCell(i, j).getText()));
+                dto.setMarketValue(this.cleaningValue(table.getCell(i, j + 1).getText()));
+                dto.setRatio(this.cleaningValue(table.getCell(i, j + 2).getText()));
+                dtos.add(dto);
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Builds the asset-allocation records from {@code assetAllocationTables}.
+     * Column 0 is the asset type (carried over to following rows when blank), column 1 the
+     * asset detail and column 2 the market value, optionally in the form
+     * {@code remark#value} with several entries separated by {@code ;}.
+     *
+     * @param fileId id of the report file
+     * @return the created records; rows without a market value or asset type are skipped
+     */
+    private List<ReportAssetAllocationDTO> buildAssetAllocationInfo(Integer fileId) {
+        List<ReportAssetAllocationDTO> dtos = ListUtil.list(false);
+        String assetType = null;
+        for (Table table : this.assetAllocationTables) {
+            // Row by row
+            for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
+                // Sort the cells by x coordinate (guards against rows whose cells are out of order)
+                row.sort(Comparator.comparing(Rectangle2D.Float::getX));
+                // Asset type
+                String type = this.cleaningValue(row.get(0).getText());
+                if (StrUtil.isNotBlank(type)) {
+                    assetType = type;
+                }
+                // Amount / market value, sometimes in the form "remark#amount"
+                String marketValueAndRemark = this.cleaningValue(row.get(2).getText());
+                if (StrUtil.isBlank(marketValueAndRemark) || StrUtil.isBlank(assetType)) {
+                    continue;
+                }
+                // Asset detail
+                String detail = this.cleaningValue(row.get(1).getText(), false);
+                if (StrUtil.contains(marketValueAndRemark, "#")) {
+                    // '#' means there are remarks, possibly several entries separated by ';'
+                    List<String> marketValueAndRemarks = StrUtil.split(marketValueAndRemark, ";");
+                    for (String mr : marketValueAndRemarks) {
+                        if (StrUtil.isBlank(mr)) {
+                            continue;
+                        }
+                        List<String> mrs = StrUtil.split(mr, "#");
+                        // An entry without '#' has no remark; it is a plain market value.
+                        // Reading index 1 unconditionally would throw for such an entry.
+                        boolean hasRemark = mrs.size() > 1;
+                        ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
+                        dto.setAssetType(assetType);
+                        dto.setAssetDetails(detail);
+                        dto.setMarketValue(hasRemark ? mrs.get(1) : mrs.get(0));
+                        if (hasRemark) {
+                            dto.setRemark(mrs.get(0));
+                        }
+                        dtos.add(dto);
+                    }
+                } else {
+                    ReportAssetAllocationDTO dto = new ReportAssetAllocationDTO(fileId);
+                    dto.setAssetType(assetType);
+                    dto.setAssetDetails(detail);
+                    dto.setMarketValue(marketValueAndRemark);
+                    dtos.add(dto);
+                }
+            }
+        }
+        return dtos;
+    }
+
+    /**
+     * Collects the non-blank texts of one column of a table.
+     *
+     * @param table the table
+     * @param col   index of the column to read
+     * @return the non-blank cell texts of that column, in row order
+     */
+    protected List<String> getTableColTexts(Table table, Integer col) {
+        List<String> details = ListUtil.list(false);
+        for (@SuppressWarnings("all") List<RectangularTextContainer> row : table.getRows()) {
+            String detail = this.cleaningValue(row.get(col).getText(), false);
+            if (StrUtil.isNotBlank(detail)) {
+                details.add(detail);
+            }
+        }
+        return details;
+    }
+}

+ 78 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java

@@ -0,0 +1,78 @@
+package com.simuwang.daq.components.report.parser.py;
+
+import cn.hutool.core.map.MapUtil;
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.http.HttpUtil;
+import cn.hutool.json.JSONUtil;
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.common.exception.ReportParseException;
+import com.simuwang.base.config.DaqProperties;
+import com.simuwang.base.mapper.FundInfoMapper;
+import com.simuwang.base.pojo.dos.FundAndCompanyInfoDO;
+import com.simuwang.base.pojo.dto.report.ParseResult;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParseStatus;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+import com.simuwang.daq.components.PythonReportConverter;
+import com.simuwang.daq.components.report.parser.ReportParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Objects;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 16:46
+ * @description Base class of the parsers that delegate report parsing to the python service
+ */
+public abstract class AbstractPyReportParser<T extends ReportData> implements ReportParser<T> {
+    /**
+     * Path of the python parsing endpoint, appended to the configured base url.
+     */
+    private static final String PARSE_API = "/api/v1/parse/amac_report";
+
+    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    private final DaqProperties properties;
+    private final FundInfoMapper fundInfoMapper;
+
+    public AbstractPyReportParser(DaqProperties properties, FundInfoMapper fundInfoMapper) {
+        this.properties = properties;
+        this.fundInfoMapper = fundInfoMapper;
+    }
+
+    /**
+     * Posts the report description to the python service and converts the answer.
+     *
+     * @param params file id, path, name and register number of the report to parse
+     * @return the parsed report data, or {@code null} when the python parser is not enabled
+     * @throws IOException          declared by the {@code ReportParser} contract
+     * @throws ReportParseException when the service answers without a status or with a status other than 1
+     */
+    @Override
+    public T parse(ReportParserParams params) throws IOException, ReportParseException {
+        // Boolean.TRUE.equals treats an unset flag as disabled instead of failing on unboxing
+        if (!Boolean.TRUE.equals(this.properties.getEnablePyParser())) {
+            this.logger.error("The python report parser is unavailable!");
+            return null;
+        }
+        ReportType reportType = this.getReportType();
+        String registerNumber = params.getRegisterNumber();
+        Map<String, Object> param = MapUtil.newHashMap(16);
+        param.put("file_id", params.getFileId());
+        param.put("file_path", params.getFilepath());
+        param.put("register_number", registerNumber);
+        param.put("file_type", reportType.getType());
+        param.put("file_name", params.getFilename());
+        if (StrUtil.isNotBlank(registerNumber)) {
+            // Add fund and company names when the register number is known
+            FundAndCompanyInfoDO info = this.fundInfoMapper.queryFundAndTrustByRegisterNumber(registerNumber);
+            if (info != null) {
+                param.put("fund_name", info.getFundName());
+                param.put("trust_name", info.getCompanyName());
+            }
+        }
+        // Send the request map built above; the original serialised the incoming params object,
+        // so none of the keys assembled here ever reached the service.
+        String body = HttpUtil.post(this.properties.getPyBaseUrl() + PARSE_API, JSONUtil.toJsonStr(param));
+        ParseResult<T> result = PythonReportConverter.convert(JSONUtil.parseObj(body), reportType);
+        if (result.getStatus() == null) {
+            throw new ReportParseException(ReportParseStatus.PARSE_FAIL, "资源文件不存在");
+        }
+        if (!Objects.equals(1, result.getStatus())) {
+            this.logger.error("报告{} 解析失败:{}", params, result.getMsg());
+            throw new ReportParseException(result.getStatus(), result.getMsg());
+        }
+        return result.getData();
+    }
+
+    /**
+     * @return the report type handled by the concrete parser; its type value is sent as {@code file_type}
+     */
+    protected abstract ReportType getReportType();
+}

+ 25 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonAnnuallyReportParser.java

@@ -0,0 +1,25 @@
+package com.simuwang.daq.components.report.parser.py;
+
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.config.DaqProperties;
+import com.simuwang.base.mapper.FundInfoMapper;
+import com.simuwang.base.pojo.dto.report.AnnuallyReportData;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+/**
+ * Annual-report parser backed by the python parsing service; the request itself is
+ * issued by {@link AbstractPyReportParser}.
+ */
+@Component(ReportParserConstant.PARSER_PYTHON_ANNUALLY)
+public class PythonAnnuallyReportParser extends AbstractPyReportParser<AnnuallyReportData> {
+    public PythonAnnuallyReportParser(DaqProperties properties, FundInfoMapper fundInfoMapper) {
+        super(properties, fundInfoMapper);
+    }
+
+    /**
+     * @return the parser name, identical to the bean name of this component
+     */
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PYTHON_ANNUALLY;
+    }
+
+    /**
+     * @return {@link ReportType#ANNUALLY}
+     */
+    @Override
+    protected ReportType getReportType() {
+        return ReportType.ANNUALLY;
+    }
+}

+ 25 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonMonthlyReportParser.java

@@ -0,0 +1,25 @@
+package com.simuwang.daq.components.report.parser.py;
+
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.config.DaqProperties;
+import com.simuwang.base.mapper.FundInfoMapper;
+import com.simuwang.base.pojo.dto.report.MonthlyReportData;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+/**
+ * Monthly-report parser backed by the python parsing service; the request itself is
+ * issued by {@link AbstractPyReportParser}.
+ */
+@Component(ReportParserConstant.PARSER_PYTHON_MONTHLY)
+public class PythonMonthlyReportParser extends AbstractPyReportParser<MonthlyReportData> {
+    public PythonMonthlyReportParser(DaqProperties properties, FundInfoMapper fundInfoMapper) {
+        super(properties, fundInfoMapper);
+    }
+
+    /**
+     * @return the parser name, identical to the bean name of this component
+     */
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PYTHON_MONTHLY;
+    }
+
+    /**
+     * @return {@link ReportType#MONTHLY}
+     */
+    @Override
+    protected ReportType getReportType() {
+        return ReportType.MONTHLY;
+    }
+}

+ 25 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/PythonQuarterlyReportParser.java

@@ -0,0 +1,25 @@
+package com.simuwang.daq.components.report.parser.py;
+
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.config.DaqProperties;
+import com.simuwang.base.mapper.FundInfoMapper;
+import com.simuwang.base.pojo.dto.report.QuarterlyReportData;
+import com.simuwang.daq.components.report.parser.ReportParserConstant;
+import org.springframework.stereotype.Component;
+
+/**
+ * Quarterly-report parser backed by the python parsing service; the request itself is
+ * issued by {@link AbstractPyReportParser}.
+ */
+@Component(ReportParserConstant.PARSER_PYTHON_QUARTERLY)
+public class PythonQuarterlyReportParser extends AbstractPyReportParser<QuarterlyReportData> {
+    public PythonQuarterlyReportParser(DaqProperties properties, FundInfoMapper fundInfoMapper) {
+        super(properties, fundInfoMapper);
+    }
+
+    /**
+     * @return the parser name, identical to the bean name of this component
+     */
+    @Override
+    public String getParser() {
+        return ReportParserConstant.PARSER_PYTHON_QUARTERLY;
+    }
+
+    /**
+     * @return {@link ReportType#QUARTERLY}
+     */
+    @Override
+    protected ReportType getReportType() {
+        return ReportType.QUARTERLY;
+    }
+}

+ 56 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/writer/AbstractReportWriter.java

@@ -0,0 +1,56 @@
+package com.simuwang.daq.components.report.writer;
+
+import com.simuwang.base.mapper.report.ReportBaseInfoMapper;
+import com.simuwang.base.mapper.report.ReportFundInfoMapper;
+import com.simuwang.base.pojo.dos.report.ReportBaseInfoDO;
+import com.simuwang.base.pojo.dos.report.ReportFundInfoDO;
+import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.transaction.annotation.Transactional;
+
+/**
+ * Base report writer: stores the report base information and the fund information, then
+ * lets the concrete writer store the data specific to its report type.
+ */
+public abstract class AbstractReportWriter<T extends ReportData> implements ReportWriter<T> {
+    private final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    private final ReportBaseInfoMapper baseInfoMapper;
+    private final ReportFundInfoMapper fundInfoMapper;
+
+    public AbstractReportWriter(ReportBaseInfoMapper baseInfoMapper, ReportFundInfoMapper fundInfoMapper) {
+        this.baseInfoMapper = baseInfoMapper;
+        this.fundInfoMapper = fundInfoMapper;
+    }
+
+    /**
+     * Saves the whole report in one transaction that rolls back on any exception.
+     * A {@code null} report is logged and ignored.
+     *
+     * @param reportData the parsed report, may be {@code null}
+     */
+    @Override
+    @Transactional(rollbackFor = Exception.class)
+    public void write(T reportData) {
+        if (reportData != null) {
+            // Common parts first: base information, then fund information
+            this.saveBaseInfo(reportData);
+            this.saveFundInfo(reportData);
+            // Data specific to the report type
+            this.writeExtData(reportData);
+            return;
+        }
+        this.logger.error("The report no result!");
+    }
+
+    /**
+     * Inserts the report base information when the report has one.
+     */
+    private void saveBaseInfo(T reportData) {
+        ReportBaseInfoDTO dto = reportData.getBaseInfo();
+        if (dto == null) {
+            return;
+        }
+        ReportBaseInfoDO row = dto.toEntity();
+        this.baseInfoMapper.insert(row);
+    }
+
+    /**
+     * Inserts the fund information when the report has one.
+     */
+    private void saveFundInfo(T reportData) {
+        ReportFundInfoDTO dto = reportData.getFundInfo();
+        if (dto == null) {
+            return;
+        }
+        ReportFundInfoDO row = dto.toEntity();
+        this.fundInfoMapper.insert(row);
+    }
+
+    /**
+     * Stores the data that only exists for the concrete report type.
+     *
+     * @param reportData the parsed report, never {@code null} when called from {@link #write}
+     */
+    protected abstract void writeExtData(T reportData);
+}

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/components/writer/AnnuallyReportWriter.java

@@ -1,4 +1,4 @@
-package com.simuwang.daq.components.writer;
+package com.simuwang.daq.components.report.writer;
 
 import com.simuwang.base.mapper.report.*;
 import com.simuwang.base.pojo.dto.report.AnnuallyReportData;

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/components/writer/MonthlyReportWriter.java

@@ -1,4 +1,4 @@
-package com.simuwang.daq.components.writer;
+package com.simuwang.daq.components.report.writer;
 
 import cn.hutool.core.collection.CollUtil;
 import com.simuwang.base.mapper.report.ReportBaseInfoMapper;

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/components/writer/QuarterlyReportWriter.java

@@ -1,4 +1,4 @@
-package com.simuwang.daq.components.writer;
+package com.simuwang.daq.components.report.writer;
 
 import cn.hutool.core.collection.CollUtil;
 import com.simuwang.base.mapper.report.*;

+ 12 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/writer/ReportWriter.java

@@ -0,0 +1,12 @@
+package com.simuwang.daq.components.report.writer;
+
+import com.simuwang.base.pojo.dto.report.ReportData;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 14:06
+ * @description Service contract for storing parsed report data (can be extended to
+ * support saving to a local cache or to files). The single method {@code write}
+ * receives the report data to store.
+ */
+public interface ReportWriter<T extends ReportData> {
+    void write(T reportData);
+}

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriterConstant.java

@@ -1,4 +1,4 @@
-package com.simuwang.daq.components.writer;
+package com.simuwang.daq.components.report.writer;
 
 import cn.hutool.core.map.MapUtil;
 import com.simuwang.base.common.enums.ReportType;

+ 1 - 1
service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriterFactory.java

@@ -1,4 +1,4 @@
-package com.simuwang.daq.components.writer;
+package com.simuwang.daq.components.report.writer;
 
 import cn.hutool.core.map.MapUtil;
 import com.simuwang.base.common.enums.ReportType;

+ 0 - 73
service-daq/src/main/java/com/simuwang/daq/components/writer/AbstractReportWriter.java

@@ -1,73 +0,0 @@
-package com.simuwang.daq.components.writer;
-
-import cn.hutool.core.exceptions.ExceptionUtil;
-import com.simuwang.base.mapper.report.ReportBaseInfoMapper;
-import com.simuwang.base.mapper.report.ReportFundInfoMapper;
-import com.simuwang.base.pojo.dos.report.ReportBaseInfoDO;
-import com.simuwang.base.pojo.dos.report.ReportFundInfoDO;
-import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
-import com.simuwang.base.pojo.dto.report.ReportData;
-import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.transaction.annotation.Transactional;
-import org.springframework.util.StopWatch;
-
-public abstract class AbstractReportWriter<T extends ReportData> implements ReportWriter<T> {
-    private final Logger logger = LoggerFactory.getLogger(this.getClass());
-
-    private final ReportBaseInfoMapper baseInfoMapper;
-    private final ReportFundInfoMapper fundInfoMapper;
-
-    public AbstractReportWriter(ReportBaseInfoMapper baseInfoMapper, ReportFundInfoMapper fundInfoMapper) {
-        this.baseInfoMapper = baseInfoMapper;
-        this.fundInfoMapper = fundInfoMapper;
-    }
-
-    @Override
-    @Transactional(rollbackFor = Exception.class)
-    public void write(T reportData) {
-        StopWatch stopWatch = new StopWatch();
-        stopWatch.start();
-        // 基本信息+基金信息保存
-        this.saveBaseInfo(reportData);
-        this.saveFundInfo(reportData);
-        try {
-            // 其他信息保存
-            this.writeExtData(reportData);
-        } catch (Exception e) {
-            this.logger.error("报告解析结果之类型特有数据保存报错\n{}", ExceptionUtil.stacktraceToString(e));
-        }
-        stopWatch.stop();
-        long totalTimeMillis = stopWatch.getTotalTimeMillis();
-        if (this.logger.isInfoEnabled()) {
-            this.logger.info("报告解析结果保存成功,耗时:{}ms", totalTimeMillis);
-        }
-    }
-
-    private void saveBaseInfo(T reportData) {
-        try {
-            ReportBaseInfoDTO baseInfo = reportData.getBaseInfo();
-            if (baseInfo != null) {
-                ReportBaseInfoDO entity = baseInfo.toEntity();
-                this.baseInfoMapper.insert(entity);
-            }
-        } catch (Exception e) {
-            this.logger.error("报告解析结果之报告基本信息保存报错\n{}", ExceptionUtil.stacktraceToString(e));
-        }
-    }
-
-    private void saveFundInfo(T reportData) {
-        try {
-            ReportFundInfoDTO fundInfo = reportData.getFundInfo();
-            if (fundInfo != null) {
-                ReportFundInfoDO entity = fundInfo.toEntity();
-                this.fundInfoMapper.insert(entity);
-            }
-        } catch (Exception e) {
-            this.logger.error("报告解析结果之基金信息保存报错\n{}", ExceptionUtil.stacktraceToString(e));
-        }
-    }
-
-    protected abstract void writeExtData(T reportData);
-}

+ 0 - 7
service-daq/src/main/java/com/simuwang/daq/components/writer/ReportWriter.java

@@ -1,7 +0,0 @@
-package com.simuwang.daq.components.writer;
-
-import com.simuwang.base.pojo.dto.report.ReportData;
-
-public interface ReportWriter<T extends ReportData> {
-    void write(T reportData);
-}

+ 0 - 43
service-daq/src/main/java/com/simuwang/daq/dto/MonthlyReportNavInfo.java

@@ -1,43 +0,0 @@
-package com.simuwang.daq.dto;
-
-public class MonthlyReportNavInfo extends ReportExtInfo {
-    private String valuationDate;
-
-    private String nav;
-
-    private String endTotalShares;
-
-    private String fundAssetSize;
-
-    public String getValuationDate() {
-        return valuationDate;
-    }
-
-    public void setValuationDate(String valuationDate) {
-        this.valuationDate = valuationDate;
-    }
-
-    public String getNav() {
-        return nav;
-    }
-
-    public void setNav(String nav) {
-        this.nav = nav;
-    }
-
-    public String getEndTotalShares() {
-        return endTotalShares;
-    }
-
-    public void setEndTotalShares(String endTotalShares) {
-        this.endTotalShares = endTotalShares;
-    }
-
-    public String getFundAssetSize() {
-        return fundAssetSize;
-    }
-
-    public void setFundAssetSize(String fundAssetSize) {
-        this.fundAssetSize = fundAssetSize;
-    }
-}

+ 0 - 13
service-daq/src/main/java/com/simuwang/daq/dto/ReportExtInfo.java

@@ -1,13 +0,0 @@
-package com.simuwang.daq.dto;
-
-public class ReportExtInfo {
-    private Integer fileId;
-
-    public Integer getFileId() {
-        return fileId;
-    }
-
-    public void setFileId(Integer fileId) {
-        this.fileId = fileId;
-    }
-}

+ 0 - 18
service-daq/src/main/java/com/simuwang/daq/dto/ReportFileType.java

@@ -1,18 +0,0 @@
-package com.simuwang.daq.dto;
-
-public enum ReportFileType {
-    PDF("pdf"),
-    DOCX("docx"),
-    DOC("doc"),
-    XLSX("xlsx");
-
-    private final String suffix;
-
-    ReportFileType(String suffix) {
-        this.suffix = suffix;
-    }
-
-    public String getSuffix() {
-        return suffix;
-    }
-}

+ 0 - 291
service-daq/src/main/java/com/simuwang/daq/dto/ReportFundInfo.java

@@ -1,291 +0,0 @@
-package com.simuwang.daq.dto;
-
-/**
- * @author wangzaijun
- * @date 2024/9/12 15:34
- * @description 报告解析的基金信息
- */
-public class ReportFundInfo {
-    private String fundName;
-    private String cFundName;
-    /**
-     * 是否分级基金
-     */
-    private Integer istiered;
-    /**
-     * 备案编码
-     */
-    private String registerNumber;
-    private String trustName;
-    private String custodianName;
-    private String advisorName;
-    /**
-     * 运作方式 开放式或封闭式
-     */
-    private String operationType;
-    private String fundType;
-    /**
-     * 成立日期
-     */
-    private String inceptionDate;
-    private String sharePerAsset;
-    private String investmentObjective;
-    private String fundStrategyDescription;
-    private String secondaryBenchmark;
-    private String riskReturnDesc;
-    private String realizedIncome;
-    private String profit;
-    private String fundAssetSize;
-    private String nav;
-    private String initTotalShares;
-    private String subscription;
-    private String redemption;
-    private String split;
-    /**
-     * 杠杆信息描述
-     */
-    private String leverageNote;
-    /**
-     * 杠杆比例
-     */
-    private String leverage;
-    private String remark;
-    private String industryTrend;
-    private String fundManager;
-    /**
-     * 是否托管复核
-     */
-    private String reviewed;
-
-    public String getFundName() {
-        return fundName;
-    }
-
-    public void setFundName(String fundName) {
-        this.fundName = fundName;
-    }
-
-    public String getcFundName() {
-        return cFundName;
-    }
-
-    public void setcFundName(String cFundName) {
-        this.cFundName = cFundName;
-    }
-
-    public Integer getIstiered() {
-        return istiered;
-    }
-
-    public void setIstiered(Integer istiered) {
-        this.istiered = istiered;
-    }
-
-    public String getRegisterNumber() {
-        return registerNumber;
-    }
-
-    public void setRegisterNumber(String registerNumber) {
-        this.registerNumber = registerNumber;
-    }
-
-    public String getTrustName() {
-        return trustName;
-    }
-
-    public void setTrustName(String trustName) {
-        this.trustName = trustName;
-    }
-
-    public String getCustodianName() {
-        return custodianName;
-    }
-
-    public void setCustodianName(String custodianName) {
-        this.custodianName = custodianName;
-    }
-
-    public String getAdvisorName() {
-        return advisorName;
-    }
-
-    public void setAdvisorName(String advisorName) {
-        this.advisorName = advisorName;
-    }
-
-    public String getOperationType() {
-        return operationType;
-    }
-
-    public void setOperationType(String operationType) {
-        this.operationType = operationType;
-    }
-
-    public String getFundType() {
-        return fundType;
-    }
-
-    public void setFundType(String fundType) {
-        this.fundType = fundType;
-    }
-
-    public String getInceptionDate() {
-        return inceptionDate;
-    }
-
-    public void setInceptionDate(String inceptionDate) {
-        this.inceptionDate = inceptionDate;
-    }
-
-    public String getSharePerAsset() {
-        return sharePerAsset;
-    }
-
-    public void setSharePerAsset(String sharePerAsset) {
-        this.sharePerAsset = sharePerAsset;
-    }
-
-    public String getInvestmentObjective() {
-        return investmentObjective;
-    }
-
-    public void setInvestmentObjective(String investmentObjective) {
-        this.investmentObjective = investmentObjective;
-    }
-
-    public String getFundStrategyDescription() {
-        return fundStrategyDescription;
-    }
-
-    public void setFundStrategyDescription(String fundStrategyDescription) {
-        this.fundStrategyDescription = fundStrategyDescription;
-    }
-
-    public String getSecondaryBenchmark() {
-        return secondaryBenchmark;
-    }
-
-    public void setSecondaryBenchmark(String secondaryBenchmark) {
-        this.secondaryBenchmark = secondaryBenchmark;
-    }
-
-    public String getRiskReturnDesc() {
-        return riskReturnDesc;
-    }
-
-    public void setRiskReturnDesc(String riskReturnDesc) {
-        this.riskReturnDesc = riskReturnDesc;
-    }
-
-    public String getRealizedIncome() {
-        return realizedIncome;
-    }
-
-    public void setRealizedIncome(String realizedIncome) {
-        this.realizedIncome = realizedIncome;
-    }
-
-    public String getProfit() {
-        return profit;
-    }
-
-    public void setProfit(String profit) {
-        this.profit = profit;
-    }
-
-    public String getFundAssetSize() {
-        return fundAssetSize;
-    }
-
-    public void setFundAssetSize(String fundAssetSize) {
-        this.fundAssetSize = fundAssetSize;
-    }
-
-    public String getNav() {
-        return nav;
-    }
-
-    public void setNav(String nav) {
-        this.nav = nav;
-    }
-
-    public String getInitTotalShares() {
-        return initTotalShares;
-    }
-
-    public void setInitTotalShares(String initTotalShares) {
-        this.initTotalShares = initTotalShares;
-    }
-
-    public String getSubscription() {
-        return subscription;
-    }
-
-    public void setSubscription(String subscription) {
-        this.subscription = subscription;
-    }
-
-    public String getRedemption() {
-        return redemption;
-    }
-
-    public void setRedemption(String redemption) {
-        this.redemption = redemption;
-    }
-
-    public String getSplit() {
-        return split;
-    }
-
-    public void setSplit(String split) {
-        this.split = split;
-    }
-
-    public String getLeverageNote() {
-        return leverageNote;
-    }
-
-    public void setLeverageNote(String leverageNote) {
-        this.leverageNote = leverageNote;
-    }
-
-    public String getLeverage() {
-        return leverage;
-    }
-
-    public void setLeverage(String leverage) {
-        this.leverage = leverage;
-    }
-
-    public String getRemark() {
-        return remark;
-    }
-
-    public void setRemark(String remark) {
-        this.remark = remark;
-    }
-
-    public String getIndustryTrend() {
-        return industryTrend;
-    }
-
-    public void setIndustryTrend(String industryTrend) {
-        this.industryTrend = industryTrend;
-    }
-
-    public String getFundManager() {
-        return fundManager;
-    }
-
-    public void setFundManager(String fundManager) {
-        this.fundManager = fundManager;
-    }
-
-    public String getReviewed() {
-        return reviewed;
-    }
-
-    public void setReviewed(String reviewed) {
-        this.reviewed = reviewed;
-    }
-}

+ 0 - 54
service-daq/src/main/java/com/simuwang/daq/dto/ReportInfo.java

@@ -1,54 +0,0 @@
-package com.simuwang.daq.dto;
-
-/**
- * @author wangzaijun
- * @date 2024/9/11 17:57
- * @description 报告基本信息
- */
-public class ReportInfo {
-    private Integer fileId;
-    /**
-     * 报告名称
-     */
-    private String reportName;
-    /**
-     * 报告类型(月、季、年)
-     */
-    private String reportType;
-    /**
-     * 报告日期
-     */
-    private String reportDate;
-
-    public Integer getFileId() {
-        return fileId;
-    }
-
-    public void setFileId(Integer fileId) {
-        this.fileId = fileId;
-    }
-
-    public String getReportName() {
-        return reportName;
-    }
-
-    public void setReportName(String reportName) {
-        this.reportName = reportName;
-    }
-
-    public String getReportType() {
-        return reportType;
-    }
-
-    public void setReportType(String reportType) {
-        this.reportType = reportType;
-    }
-
-    public String getReportDate() {
-        return reportDate;
-    }
-
-    public void setReportDate(String reportDate) {
-        this.reportDate = reportDate;
-    }
-}

+ 97 - 71
service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java

@@ -8,9 +8,10 @@ import cn.hutool.core.date.DateUtil;
 import cn.hutool.core.exceptions.ExceptionUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
-import cn.hutool.http.HttpUtil;
-import cn.hutool.json.JSONUtil;
 import com.simuwang.base.common.conts.*;
+import com.simuwang.base.common.enums.ReportParserFileType;
+import com.simuwang.base.common.enums.ReportType;
+import com.simuwang.base.common.exception.ReportParseException;
 import com.simuwang.base.common.util.EmailUtil;
 import com.simuwang.base.common.util.ExcelUtil;
 import com.simuwang.base.common.util.FileUtil;
@@ -21,11 +22,15 @@ import com.simuwang.base.pojo.dos.*;
 import com.simuwang.base.pojo.dto.EmailContentInfoDTO;
 import com.simuwang.base.pojo.dto.EmailFundNavDTO;
 import com.simuwang.base.pojo.dto.MailboxInfoDTO;
-import com.simuwang.base.pojo.dto.report.PythonResult;
+import com.simuwang.base.pojo.dto.report.ParseResult;
 import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportParseStatus;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
 import com.simuwang.base.pojo.valuation.CmValuationTableAttribute;
-import com.simuwang.daq.components.PythonReportConverter;
-import com.simuwang.daq.components.writer.ReportWriterFactory;
+import com.simuwang.daq.components.report.parser.ReportParser;
+import com.simuwang.daq.components.report.parser.ReportParserFactory;
+import com.simuwang.daq.components.report.writer.ReportWriter;
+import com.simuwang.daq.components.report.writer.ReportWriterFactory;
 import jakarta.mail.*;
 import jakarta.mail.internet.MimeMessage;
 import jakarta.mail.internet.MimeMultipart;
@@ -37,6 +42,7 @@ import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
+import org.springframework.util.StopWatch;
 
 import java.io.File;
 import java.math.BigDecimal;
@@ -53,10 +59,8 @@ import java.util.stream.Collectors;
 @Service
 public class EmailParseService {
 
+    public static final int stepSize = 10000;
     private static final Logger log = LoggerFactory.getLogger(EmailParseService.class);
-
-    private final String pyBaseUrl;
-
     private final EmailTypeRuleMapper emailTypeRuleMapper;
     private final EmailRuleConfig emailRuleConfig;
     private final EmailFieldMappingMapper emailFieldMapper;
@@ -75,19 +79,22 @@ public class EmailParseService {
 
     @Value("${email.file.path}")
     private String path;
+
     @Autowired
-    private FundInfoMapper fundInfoMapper;
+    private DaqProperties properties;
+
+    /* 报告解析和入库的方法 */
+    @Autowired
+    private ReportParserFactory reportParserFactory;
     @Autowired
     private ReportWriterFactory reportWriterFactory;
 
-    public static final int stepSize = 10000;
-
     public EmailParseService(EmailTypeRuleMapper emailTypeRuleMapper, EmailRuleConfig emailRuleConfig,
                              EmailFieldMappingMapper emailFieldMapper, EmailParserFactory emailParserFactory,
                              EmailParseInfoMapper emailParseInfoMapper, EmailFileInfoMapper emailFileInfoMapper,
                              EmailFundNavMapper emailFundNavMapper, EmailFundAssetMapper emailFundAssetMapper,
                              AssetMapper assetMapper, NavMapper navMapper, FundService fundService,
-                             FundAliasMapper fundAliasMapper, DaqProperties properties,
+                             FundAliasMapper fundAliasMapper,
                              ValuationTableMapper valuationTableMapper, ValuationTableAttributeMapper valuationTableAttributeMapper,
                              FundPositionDetailMapper fundPositionDetailMapper) {
         this.emailTypeRuleMapper = emailTypeRuleMapper;
@@ -103,7 +110,6 @@ public class EmailParseService {
         this.fundService = fundService;
         this.fundAliasMapper = fundAliasMapper;
 
-        this.pyBaseUrl = properties.getPyBaseUrl();
         this.valuationTableMapper = valuationTableMapper;
         this.valuationTableAttributeMapper = valuationTableAttributeMapper;
         this.fundPositionDetailMapper = fundPositionDetailMapper;
@@ -175,7 +181,7 @@ public class EmailParseService {
         emailId = saveEmailParseInfo(emailParseInfoDO);
 
         // python 报告解析接口结果
-        List<ReportData> dataList = ListUtil.list(false);
+        List<ParseResult<ReportData>> dataList = ListUtil.list(false);
         for (Map.Entry<EmailContentInfoDTO, List<EmailFundNavDTO>> fileNameNavEntry : fileNameNavMap.entrySet()) {
             // 保存邮件文件表
             EmailContentInfoDTO emailContentInfoDTO = fileNameNavEntry.getKey();
@@ -193,12 +199,10 @@ public class EmailParseService {
             if (CollUtil.isEmpty(fundNavDTOList) && !Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType)) {
                 continue;
             }
-            // python接口解析结果
-            ReportData data = this.requestPyAndResult(fileId, emailContentInfoDTO);
-            if (data != null) {
-                // 保存报告解析数据
-                this.reportWriterFactory.getInstance(data.getReportType()).write(data);
-                dataList.add(data);
+            if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType)) {
+                // 解析结果(可以从python获取或者自行解析)并保存报告
+                ParseResult<ReportData> parseResult = this.parseReportAndHandleResult(fileId, emailContentInfoDTO);
+                dataList.add(parseResult);
             }
             for (EmailFundNavDTO fundNavDTO : fundNavDTOList) {
                 // 设置净值数据的解析状态
@@ -212,11 +216,6 @@ public class EmailParseService {
         // 更新邮件解析结果 -> 当【净值日期】和【备案编码/基金名称】能正常解读,即识别为【成功】
         long successNavCount = fileNameNavMap.values().stream().flatMap(List::stream).filter(e -> e != null && StrUtil.isBlank(e.getFailReason())).count();
         emailParseStatus = successNavCount >= 1 ? EmailParseStatusConst.SUCCESS : EmailParseStatusConst.FAIL;
-        // 报告邮件有一条成功就表示整体成功
-        if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType) && CollUtil.isNotEmpty(dataList)) {
-            long count = dataList.size();
-            emailParseStatus = count >= 1 ? EmailParseStatusConst.SUCCESS : EmailParseStatusConst.FAIL;
-        }
         String failReason = null;
         if (emailParseStatus == EmailParseStatusConst.FAIL) {
             // 邮件解析失败时 -> 保存失败原因
@@ -224,6 +223,14 @@ public class EmailParseService {
             List<EmailFundNavDTO> navDTOList = fileNameNavMap.values().stream().flatMap(List::stream).toList();
             failReason = hasPdfFile == 1 && CollUtil.isEmpty(navDTOList) ? "无法从pdf文件中获取到数据" : navDTOList.stream().map(EmailFundNavDTO::getFailReason).distinct().collect(Collectors.joining("/"));
         }
+        // 报告邮件有一条失败就表示整个邮件解析失败
+        if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType) && CollUtil.isNotEmpty(dataList)) {
+            failReason = dataList.stream().filter(e -> !Objects.equals(1, e.getStatus()))
+                    .findFirst().map(ParseResult::getMsg).orElse(null);
+            if (failReason != null) {
+                emailParseStatus = EmailParseStatusConst.FAIL;
+            }
+        }
         emailParseInfoMapper.updateParseStatus(emailId, emailParseStatus, failReason);
     }
 
@@ -361,61 +368,80 @@ public class EmailParseService {
         }).collect(Collectors.toList());
     }
 
-    private ReportData requestPyAndResult(int fileId, EmailContentInfoDTO emailContentInfoDTO) {
+    private ParseResult<ReportData> parseReportAndHandleResult(int fileId, EmailContentInfoDTO emailContentInfoDTO) {
+        ParseResult<ReportData> result = new ParseResult<>();
         String fileName = emailContentInfoDTO.getFileName();
         Integer emailType = emailContentInfoDTO.getEmailType();
+        if (!Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType) || StrUtil.isBlank(fileName)) {
+            result.setStatus(ReportParseStatus.NOT_A_REPORT.getCode());
+            result.setMsg(ReportParseStatus.NOT_A_REPORT.getMsg());
+            return result;
+        }
+        Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
+        Matcher matcher = pattern.matcher(fileName);
+        String registerNumber = null;
+        if (matcher.find()) {
+            registerNumber = matcher.group();
+        }
+        // 类型识别---先识别季度报告,没有季度再识别年度报告,最后识别月报
+        ReportType reportType = ReportType.MONTHLY;
+        if (StrUtil.containsAny(fileName, ReportType.QUARTERLY.getPatterns())) {
+            reportType = ReportType.QUARTERLY;
+        } else if (StrUtil.containsAny(fileName, ReportType.ANNUALLY.getPatterns())) {
+            reportType = ReportType.ANNUALLY;
+        }
+        // 解析器--如果开启python解析则直接调用python接口,否则根据文件后缀获取对应解析器
+        ReportParserFileType fileType;
+        if (Objects.equals(Boolean.TRUE, this.properties.getEnablePyParser())) {
+            fileType = ReportParserFileType.PYTHON;
+        } else {
+            String fileSuffix = StrUtil.subAfter(fileName, ".", true);
+            fileType = ReportParserFileType.getBySuffix(fileSuffix);
+        }
+        // 解析报告
+        ReportParserParams params = null;
         ReportData reportData = null;
-        if (Objects.equals(EmailTypeConst.REPORT_EMAIL_TYPE, emailType)) {
-            if (StrUtil.isBlank(fileName)) {
-                return null;
-            }
-            Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
-            Matcher matcher = pattern.matcher(fileName);
-            String registerNumber = null;
-            if (matcher.find()) {
-                registerNumber = matcher.group();
-            }
-            int type = 0;
-            if (fileName.contains("季报") || fileName.contains("季度")) {
-                type = 1;
-            } else if (fileName.contains("年报") || fileName.contains("年度")) {
-                type = 2;
-            }
-            String api = "/api/v1/parse/amac_report";
-            Map<String, Object> params = MapUtil.newHashMap(16);
-            params.put("file_id", fileId);
-            params.put("file_path", emailContentInfoDTO.getFilePath());
-            params.put("register_number", registerNumber);
-            params.put("file_type", type);
-            params.put("file_name", fileName);
-            if (StrUtil.isNotBlank(registerNumber)) {
-                FundAndCompanyInfoDO info = this.fundInfoMapper.queryFundAndTrustByRegisterNumber(registerNumber);
-                if (info != null) {
-                    params.put("fund_name", info.getFundName());
-                    params.put("trust_name", info.getCompanyName());
-                }
+        StopWatch parserWatch = new StopWatch();
+        parserWatch.start();
+        try {
+            params = ReportParserParams.builder().fileId(fileId).filename(fileName)
+                    .filepath(emailContentInfoDTO.getFilePath()).registerNumber(registerNumber).build();
+            ReportParser<ReportData> instance = this.reportParserFactory.getInstance(reportType, fileType);
+            reportData = instance.parse(params);
+            result.setStatus(1);
+            result.setMsg("报告解析成功");
+            result.setData(reportData);
+        } catch (ReportParseException e) {
+            log.error("报告{}解析失败\n{}", params, e.getMsg());
+            result.setStatus(e.getCode());
+            result.setMsg(e.getMsg());
+        } catch (Exception e) {
+            log.error("报告{}解析失败\n{}", params, ExceptionUtil.stacktraceToString(e));
+            result.setStatus(ReportParseStatus.PARSE_FAIL.getCode());
+            result.setMsg(StrUtil.format(ReportParseStatus.PARSE_FAIL.getMsg(), e.getMessage()));
+        } finally {
+            parserWatch.stop();
+            if (log.isInfoEnabled()) {
+                log.info("报告{}解析结果为{},耗时{}ms", params, reportData, parserWatch.getTotalTimeMillis());
             }
-            long millis = System.currentTimeMillis();
+        }
+        // 保存报告解析结果
+        if (reportData != null) {
+            StopWatch writeWatch = new StopWatch();
+            writeWatch.start();
             try {
-                String body = HttpUtil.post(this.pyBaseUrl + api, JSONUtil.toJsonStr(params));
-                PythonResult<?> result = PythonReportConverter.convert(JSONUtil.parseObj(body), type);
-                if (!Objects.equals(1, result.getStatus())) {
-                    log.warn("报告{} 解析失败:{}", params, result.getMsg());
-                    return null;
-                }
-                reportData = result.getData();
-                if (log.isInfoEnabled()) {
-                    log.info("报告{}结果为:\n{}", params, reportData);
-                }
+                ReportWriter<ReportData> instance = this.reportWriterFactory.getInstance(reportType);
+                instance.write(reportData);
             } catch (Exception e) {
-                log.error("请求python的报告解析接口报错\n{}", ExceptionUtil.stacktraceToString(e));
+                log.error("报告{}结果保存失败\n{}", params, ExceptionUtil.stacktraceToString(e));
             } finally {
+                writeWatch.stop();
                 if (log.isInfoEnabled()) {
-                    log.info("当前报告{}解析完成,总计耗时{}ms", params, (System.currentTimeMillis() - millis));
+                    log.info("报告{}解析结果保存完成,耗时{}ms", params, writeWatch.getTotalTimeMillis());
                 }
             }
         }
-        return reportData;
+        return result;
     }
 
     private void saveNavAndAssetNet(Integer fileId, List<EmailFundNavDTO> fundNavDTOList, Date parseDate) {
@@ -829,8 +855,8 @@ public class EmailParseService {
                     emailContentInfoDTOList.add(emailContentInfoDTO);
                 }
                 if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
-                    // 估值表邮件不展示正文html文件
-                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE)) {
+                    // 估值表或定期报告邮件不展示正文html文件
+                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE) || emailType.equals(EmailTypeConst.REPORT_EMAIL_TYPE)) {
                         emailContentInfoDTOList = emailContentInfoDTOList.stream().filter(e -> !ExcelUtil.isHTML(e.getFilePath())).toList();
                     }
                     emailContentInfoDTOList.forEach(e -> {

+ 4 - 2
service-daq/src/main/java/com/simuwang/daq/service/ReportEmailParser.java

@@ -4,6 +4,7 @@ import cn.hutool.core.collection.ListUtil;
 import com.simuwang.base.common.conts.EmailTypeConst;
 import com.simuwang.base.pojo.dto.EmailContentInfoDTO;
 import com.simuwang.base.pojo.dto.EmailFundNavDTO;
+import com.simuwang.daq.components.report.parser.pdf.AbstractPDReportParser;
 import org.springframework.stereotype.Component;
 
 import java.util.List;
@@ -12,7 +13,8 @@ import java.util.Map;
 /**
  * @author wangzaijun
  * @date 2024/9/25 14:52
- * @description 报告的解析逻辑,目前先调用python接口
+ * @description 报告的解析逻辑
+ * @see com.simuwang.daq.components.report.parser.ReportParser,com.simuwang.daq.components.report.parser.py.AbstractPyReportParser, AbstractPDReportParser
  */
 @Component
 public class ReportEmailParser extends AbstractEmailParser {
@@ -24,7 +26,7 @@ public class ReportEmailParser extends AbstractEmailParser {
 
     @Override
     public List<EmailFundNavDTO> parse(EmailContentInfoDTO emailContentInfoDTO, Map<String, List<String>> emailFieldMap) {
-        // 目前啥也不做,调用python的逻辑在EmailParseService里写死,等java的逻辑完成后注释掉python逻辑
+        // 目前啥也不做,但是要返回空集合并且支持报告解析
         return ListUtil.empty();
     }
 }

+ 0 - 20
service-daq/src/main/java/com/simuwang/daq/service/ReportParseService.java

@@ -1,20 +0,0 @@
-package com.simuwang.daq.service;
-
-import com.simuwang.daq.components.ReportParser;
-import org.springframework.stereotype.Service;
-
-@Service
-public class ReportParseService {
-    private final ReportParser parser;
-
-    public ReportParseService(ReportParser parser) {
-        this.parser = parser;
-    }
-
-    public void parse() {
-//        this.parser.parse(1, "D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf", "幻方量化1000指数专享1号5期私募证券投资基金宁波幻方量化投资管理合伙企业(有限合伙)");
-        this.parser.parse(1,
-                "D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf",
-                "古曲泉发一号私募证券投资基金上海古曲私募基金管理有限公司");
-    }
-}

+ 269 - 255
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -1,50 +1,63 @@
-package com.simuwang.daq.utils;
-
-import cn.hutool.core.map.MapUtil;
-import cn.hutool.core.util.StrUtil;
-import cn.hutool.http.HttpUtil;
-import cn.hutool.json.JSONObject;
-import cn.hutool.json.JSONUtil;
-import com.simuwang.base.pojo.dto.report.PythonResult;
-import com.simuwang.daq.components.PythonReportConverter;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-
-import java.io.IOException;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class ReportParseUtil {
-    public static void main(String[] args) throws IOException {
-        String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
-        Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
-        Matcher matcher = pattern.matcher(fileName);
-        String registerNumber = null;
-        if (matcher.find()) {
-            registerNumber = matcher.group();
-        }
-
-        int type = 1;
-        String baseUrl = "http://192.168.0.81:8088";
-        String api = "/api/v1/parse/amac_report";
-        Map<String, Object> params = MapUtil.newHashMap(16);
-        params.put("file_id", 111112);
-        params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
-        params.put("register_number", registerNumber);
-        params.put("file_type", type);
-        params.put("file_name", fileName);
-        params.put("fund_name", null);
-        params.put("trust_name", null);
-        String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
-        JSONObject obj = JSONUtil.parseObj(body);
-        PythonResult<?> result = PythonReportConverter.convert(obj, type);
-        System.out.println(result);
-
+//package com.simuwang.daq.utils;
+//
+//import cn.hutool.core.collection.ListUtil;
+//import cn.hutool.core.map.MapUtil;
+//import cn.hutool.core.util.ReflectUtil;
+//import cn.hutool.core.util.StrUtil;
+//import cn.hutool.http.HttpUtil;
+//import cn.hutool.json.JSONObject;
+//import cn.hutool.json.JSONUtil;
+//import com.simuwang.base.common.conts.Constants;
+//import com.simuwang.base.pojo.dto.report.PythonResult;
+//import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
+//import com.simuwang.daq.components.CustomPDFTextStripper;
+//import com.simuwang.daq.components.PythonReportConverter;
+//import com.smppw.common.pojo.ValueLabelVO;
+//import org.apache.pdfbox.Loader;
+//import org.apache.pdfbox.cos.COSName;
+//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+//import org.apache.pdfbox.pdmodel.PDDocument;
+//import org.apache.pdfbox.pdmodel.PDPage;
+//import org.apache.pdfbox.pdmodel.PDResources;
+//import org.apache.pdfbox.pdmodel.common.PDStream;
+//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+//import org.apache.pdfbox.text.PDFTextStripper;
+//import technology.tabula.*;
+//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+//
+//import java.io.IOException;
+//import java.util.*;
+//import java.util.regex.Matcher;
+//import java.util.regex.Pattern;
+//import java.util.stream.Collectors;
+//
+//public class ReportParseUtil {
+//    public static void main(String[] args) throws IOException {
+////        String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
+////        Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
+////        Matcher matcher = pattern.matcher(fileName);
+////        String registerNumber = null;
+////        if (matcher.find()) {
+////            registerNumber = matcher.group();
+////        }
+////
+////        int type = 1;
+////        String baseUrl = "http://192.168.0.81:8088";
+////        String api = "/api/v1/parse/amac_report";
+////        Map<String, Object> params = MapUtil.newHashMap(16);
+////        params.put("file_id", 111112);
+////        params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
+////        params.put("register_number", registerNumber);
+////        params.put("file_type", type);
+////        params.put("file_name", fileName);
+////        params.put("fund_name", null);
+////        params.put("trust_name", null);
+////        String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
+////        JSONObject obj = JSONUtil.parseObj(body);
+////        PythonResult<?> result = PythonReportConverter.convert(obj, type);
+////        System.out.println(result);
+//
 //        List<ValueLabelVO> fieldMapper = ListUtil.list(false);
 //        fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
 //        fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
@@ -56,12 +69,12 @@ public class ReportParseUtil {
 //        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
 //        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
 //
-//        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
-//        List<String> watermarks = watermarkMap.get("less");
+////        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
+////        List<String> watermarks = watermarkMap.get("less");
 //
 ////        System.out.println(watermarks);
 ////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
-//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
+//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("C:\\Users\\Administrator\\Desktop\\self\\新报告解析\\基协报告\\季报\\SVP311_私募基金季报PDF_国恩回报6号增强私募证券投资基金_2024年06月30日.pdf"))) {
 ////            PDFTextStripper stripper = new PDFTextStripper();
 ////            stripper.setSortByPosition(true);
 ////            String allText = stripper.getText(document);
@@ -71,8 +84,9 @@ public class ReportParseUtil {
 //            PDFTextStripper textStripper = new CustomPDFTextStripper();
 //            textStripper.setSortByPosition(true);
 //            String text1 = textStripper.getText(document);
-//            text1 = text1.replace("+\r\n", "").replace("+","");
-//            List<String> textList = StrUtil.split(text1, "\r\n");
+//            text1 = text1.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
+//            List<String> textList = StrUtil.split(text1, System.lineSeparator());
+//            textList.removeIf(StrUtil::isBlank);
 //            System.out.println(textList.get(0));
 //
 ////            for (PDPage page : document.getPages()) {
@@ -115,7 +129,7 @@ public class ReportParseUtil {
 //                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
 //                            }
 //                        }
-//                        ReportFundInfo reportFundInfo = new ReportFundInfo();
+//                        ReportFundInfoDTO reportFundInfo = new ReportFundInfoDTO();
 //                        baseInfoMap.forEach((k, v) -> {
 //                            for (ValueLabelVO vo : fieldMapper) {
 //                                String fieldName = vo.getValue();
@@ -137,220 +151,220 @@ public class ReportParseUtil {
 //                }
 //            }
 //        }
-    }
-
-    /**
-     * 找图片水印
-     *
-     * @param page
-     * @return
-     * @throws IOException
-     */
-    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
-        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
-        PDResources resources = page.getResources();
-        Iterable<COSName> xObjectNames = resources.getXObjectNames();
-        for (COSName xObjectName : xObjectNames) {
-            PDXObject xObject = resources.getXObject(xObjectName);
-            PDStream stream = xObject.getStream();
-            PDImageXObject imageXObject = null;
-            try {
-                imageXObject = new PDImageXObject(stream, resources);
-            } catch (Exception e) {
-                e.printStackTrace();
-            }
-            if (imageXObject != null) {
-                watermarkMap.put(xObjectName, imageXObject);
-            }
-        }
-        return watermarkMap;
-    }
-
-    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
-        Map<String, List<String>> result = MapUtil.newHashMap(32);
-        // 生成水印列表
-
-        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
-        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
-        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
-        String text = fundName + trustName + registerNumber;
-        text = text.replaceAll("[()]", ""); // 移除括号
-        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-        Collections.reverse(textList);
-        StringBuilder sb = new StringBuilder(textList.size());
-        for (String ch : textList) {
-            sb.append(ch);
-        }
-        String joinedText = sb.toString();
-
-        // 基本水印列表
-        List<String> wkList = new ArrayList<>();
-        for (String ch : textList) {
-            wkList.add(ch + "\r\n");
-            wkList.add("\r\n" + ch);
-        }
-
-        // 查找数字
-        List<String> matches = findDigits(fundName);
-        if (!matches.isEmpty()) {
-            for (String match : matches) {
-                wkList.add("\r\n" + match);
-                wkList.add(match + "\r\n");
-            }
-        }
-        wkList.add("-");
-        wkList.add("【");
-        wkList.add("】");
-        wkList.add("\r");
-        wkList.add("\r\n");
-
-        String noNumberText = removeDigits(joinedText);
-
-        // 生成不同字段的水印列表
-        result.put("report_name", new ArrayList<>(wkList));
-        result.get("report_name").addAll(convertStringToList("有限公司"));
-
-        result.put("less", new ArrayList<>(wkList));
-
-        result.put("more", new ArrayList<>(wkList));
-        result.get("more").addAll(convertStringToList(noNumberText));
-
-        result.put("leverage", new ArrayList<>(wkList));
-        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-
-        result.put("base_info", new ArrayList<>(wkList));
-        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-
-        result.put("industry", new ArrayList<>(wkList));
-        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-
-        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-        return result;
-    }
-
-    private static List<String> findDigits(String text) {
-        List<String> digits = new ArrayList<>();
-        Pattern pattern = Pattern.compile("\\d");
-        Matcher matcher = pattern.matcher(text);
-        while (matcher.find()) {
-            digits.add(matcher.group());
-        }
-        return digits;
-    }
-
-    private static String removeDigits(String text) {
-        return text.replaceAll("\\d", "");
-    }
-
-    private static String removeKeywords(String text, String... keywords) {
-        for (String keyword : keywords) {
-            text = text.replaceAll(keyword, "");
-        }
-        return text;
-    }
-
-    private static List<String> convertStringToList(String text) {
-        List<String> charList = new ArrayList<>();
-        for (char c : text.toCharArray()) {
-            charList.add(c + "");
-        }
-        return charList;
-    }
-
-    public static String processString(List<String> wmList, String string) {
-        // 生成正则表达式模式
-        String pat = String.join("|", wmList);
-        // 使用正则表达式移除wmList中的元素
-        string = removeMatches(string, pat);
-        // 替换中文括号为英文括号
-        string = string.replace("(", "(").replace(")", ")");
-        // 移除空格
-        string = string.replace(" ", "");
-        // 如果字符串以括号开头,则移除第一个字符
-        if (startsWithParenthesis(string)) {
-            string = string.substring(1);
-        }
-
-        return string;
-    }
-
-    private static String removeMatches(String input, String pattern) {
-        // 编译正则表达式
-        Pattern compiledPattern = Pattern.compile(pattern);
-        // 创建Matcher对象
-        Matcher matcher = compiledPattern.matcher(input);
-        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-        return matcher.replaceAll("");
-    }
-
-    private static boolean startsWithParenthesis(String input) {
-        // 匹配以括号开头的字符串
-        Pattern pattern = Pattern.compile("^[()].*");
-        Matcher matcher = pattern.matcher(input);
-        return matcher.find();
-    }
-
-//    public static void removeTextWatermark(PDPage page) throws IOException {
+//    }
+//
+//    /**
+//     * 找图片水印
+//     *
+//     * @param page
+//     * @return
+//     * @throws IOException
+//     */
+//    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
+//        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
 //        PDResources resources = page.getResources();
-////        if (StrUtil.isAllBlank(fundName, trustName)) {
-////            return;
-////        }
-//        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
-//        stripper.setSortByPosition(true);
-//        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
-//        stripper.extractRegions(page);
+//        Iterable<COSName> xObjectNames = resources.getXObjectNames();
+//        for (COSName xObjectName : xObjectNames) {
+//            PDXObject xObject = resources.getXObject(xObjectName);
+//            PDStream stream = xObject.getStream();
+//            PDImageXObject imageXObject = null;
+//            try {
+//                imageXObject = new PDImageXObject(stream, resources);
+//            } catch (Exception e) {
+//                e.printStackTrace();
+//            }
+//            if (imageXObject != null) {
+//                watermarkMap.put(xObjectName, imageXObject);
+//            }
+//        }
+//        return watermarkMap;
+//    }
 //
-//        PDFStreamEngine engine = new PDFTextStripper();
-//        engine.addOperator(new SetMatrix(stripper));
+//    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
+//        Map<String, List<String>> result = MapUtil.newHashMap(32);
+//        // 生成水印列表
 //
-//    }
+//        fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
+//        trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
+//        registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
+//        String text = fundName + trustName + registerNumber;
+//        text = text.replaceAll("[()]", ""); // 移除括号
+//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
+//        Collections.reverse(textList);
+//        StringBuilder sb = new StringBuilder(textList.size());
+//        for (String ch : textList) {
+//            sb.append(ch);
+//        }
+//        String joinedText = sb.toString();
 //
-//    private static void processResources(PDResources resources) throws IOException {
-//        for (COSName name : resources.getXObjectNames()) {
-//            PDXObject xobject = resources.getXObject(name);
-//            if (xobject instanceof PDFormXObject) {
-//                PDFormXObject formXObject = (PDFormXObject) xobject;
-//                writeTokensToStream(formXObject.getContentStream(),
-//                        createTokensWithoutText(formXObject));
-//                processResources(formXObject.getResources());
-//            }
+//        // 基本水印列表
+//        List<String> wkList = new ArrayList<>();
+//        for (String ch : textList) {
+//            wkList.add(ch + "\r\n");
+//            wkList.add("\r\n" + ch);
 //        }
-//        for (COSName name : resources.getPatternNames()) {
-//            PDAbstractPattern pattern = resources.getPattern(name);
-//            if (pattern instanceof PDTilingPattern) {
-//                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
-//                writeTokensToStream(tilingPattern.getContentStream(),
-//                        createTokensWithoutText(tilingPattern));
-//                processResources(tilingPattern.getResources());
+//
+//        // 查找数字
+//        List<String> matches = findDigits(fundName);
+//        if (!matches.isEmpty()) {
+//            for (String match : matches) {
+//                wkList.add("\r\n" + match);
+//                wkList.add(match + "\r\n");
 //            }
 //        }
+//        wkList.add("-");
+//        wkList.add("【");
+//        wkList.add("】");
+//        wkList.add("\r");
+//        wkList.add("\r\n");
+//
+//        String noNumberText = removeDigits(joinedText);
+//
+//        // 生成不同字段的水印列表
+//        result.put("report_name", new ArrayList<>(wkList));
+//        result.get("report_name").addAll(convertStringToList("有限公司"));
+//
+//        result.put("less", new ArrayList<>(wkList));
+//
+//        result.put("more", new ArrayList<>(wkList));
+//        result.get("more").addAll(convertStringToList(noNumberText));
+//
+//        result.put("leverage", new ArrayList<>(wkList));
+//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
+//
+//        result.put("base_info", new ArrayList<>(wkList));
+//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
+//
+//        result.put("industry", new ArrayList<>(wkList));
+//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
+//
+//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
+//        return result;
 //    }
 //
-//    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
-//        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
-//            ContentStreamWriter writer = new ContentStreamWriter(out);
-//            writer.writeTokens(newTokens);
+//    private static List<String> findDigits(String text) {
+//        List<String> digits = new ArrayList<>();
+//        Pattern pattern = Pattern.compile("\\d");
+//        Matcher matcher = pattern.matcher(text);
+//        while (matcher.find()) {
+//            digits.add(matcher.group());
 //        }
+//        return digits;
 //    }
 //
-//    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
-//        PDFStreamParser parser = new PDFStreamParser(contentStream);
-//        Object token = parser.parseNextToken();
-//        List<Object> newTokens = new ArrayList<>();
-//        while (token != null) {
-//            if (token instanceof Operator op) {
-//                String opName = op.getName();
-//                if (OperatorName.SET_MATRIX.equals(opName)) {
-//                    // remove the argument to this operator
-//                    newTokens.remove(newTokens.size() - 1);
+//    private static String removeDigits(String text) {
+//        return text.replaceAll("\\d", "");
+//    }
 //
-//                    token = parser.parseNextToken();
-//                    continue;
-//                }
-//            }
-//            newTokens.add(token);
-//            token = parser.parseNextToken();
+//    private static String removeKeywords(String text, String... keywords) {
+//        for (String keyword : keywords) {
+//            text = text.replaceAll(keyword, "");
+//        }
+//        return text;
+//    }
+//
+//    private static List<String> convertStringToList(String text) {
+//        List<String> charList = new ArrayList<>();
+//        for (char c : text.toCharArray()) {
+//            charList.add(c + "");
+//        }
+//        return charList;
+//    }
+//
+//    public static String processString(List<String> wmList, String string) {
+//        // 生成正则表达式模式
+//        String pat = String.join("|", wmList);
+//        // 使用正则表达式移除wmList中的元素
+//        string = removeMatches(string, pat);
+//        // 替换中文括号为英文括号
+//        string = string.replace("(", "(").replace(")", ")");
+//        // 移除空格
+//        string = string.replace(" ", "");
+//        // 如果字符串以括号开头,则移除第一个字符
+//        if (startsWithParenthesis(string)) {
+//            string = string.substring(1);
 //        }
-//        return newTokens;
+//
+//        return string;
+//    }
+//
+//    private static String removeMatches(String input, String pattern) {
+//        // 编译正则表达式
+//        Pattern compiledPattern = Pattern.compile(pattern);
+//        // 创建Matcher对象
+//        Matcher matcher = compiledPattern.matcher(input);
+//        // 使用replaceAll方法替换所有匹配到的字符为空字符串
+//        return matcher.replaceAll("");
 //    }
-}
+//
+//    private static boolean startsWithParenthesis(String input) {
+//        // 匹配以括号开头的字符串
+//        Pattern pattern = Pattern.compile("^[()].*");
+//        Matcher matcher = pattern.matcher(input);
+//        return matcher.find();
+//    }
+//
+////    public static void removeTextWatermark(PDPage page) throws IOException {
+////        PDResources resources = page.getResources();
+//////        if (StrUtil.isAllBlank(fundName, trustName)) {
+//////            return;
+//////        }
+////        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
+////        stripper.setSortByPosition(true);
+////        stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
+////        stripper.extractRegions(page);
+////
+////        PDFStreamEngine engine = new PDFTextStripper();
+////        engine.addOperator(new SetMatrix(stripper));
+////
+////    }
+////
+////    private static void processResources(PDResources resources) throws IOException {
+////        for (COSName name : resources.getXObjectNames()) {
+////            PDXObject xobject = resources.getXObject(name);
+////            if (xobject instanceof PDFormXObject) {
+////                PDFormXObject formXObject = (PDFormXObject) xobject;
+////                writeTokensToStream(formXObject.getContentStream(),
+////                        createTokensWithoutText(formXObject));
+////                processResources(formXObject.getResources());
+////            }
+////        }
+////        for (COSName name : resources.getPatternNames()) {
+////            PDAbstractPattern pattern = resources.getPattern(name);
+////            if (pattern instanceof PDTilingPattern) {
+////                PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
+////                writeTokensToStream(tilingPattern.getContentStream(),
+////                        createTokensWithoutText(tilingPattern));
+////                processResources(tilingPattern.getResources());
+////            }
+////        }
+////    }
+////
+////    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
+////        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
+////            ContentStreamWriter writer = new ContentStreamWriter(out);
+////            writer.writeTokens(newTokens);
+////        }
+////    }
+////
+////    private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
+////        PDFStreamParser parser = new PDFStreamParser(contentStream);
+////        Object token = parser.parseNextToken();
+////        List<Object> newTokens = new ArrayList<>();
+////        while (token != null) {
+////            if (token instanceof Operator op) {
+////                String opName = op.getName();
+////                if (OperatorName.SET_MATRIX.equals(opName)) {
+////                    // remove the argument to this operator
+////                    newTokens.remove(newTokens.size() - 1);
+////
+////                    token = parser.parseNextToken();
+////                    continue;
+////                }
+////            }
+////            newTokens.add(token);
+////            token = parser.parseNextToken();
+////        }
+////        return newTokens;
+////    }
+//}

+ 61 - 0
service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java

@@ -0,0 +1,61 @@
+package technology.tabula;
+
+import com.simuwang.daq.components.CustomTabulaTextStripper;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import java.io.IOException;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/30 18:08
+ * @description Custom PDF table extractor; page extraction is overridden so that the custom watermark-removing text stripper takes effect.
+ * @see CustomTabulaTextStripper
+ */
+public class CustomObjectExtractor extends ObjectExtractor {
+    private final PDDocument pdfDocument; // same document that is handed to the superclass; read by extractPage
+
+    public CustomObjectExtractor(PDDocument pdfDocument) {
+        super(pdfDocument);
+        this.pdfDocument = pdfDocument;
+    }
+
+    @Override
+    protected Page extractPage(Integer pageNumber) throws IOException {
+        if (pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1) { // page numbers are 1-based; anything outside 1..pageCount is rejected
+            throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
+        }
+        PDPage page = pdfDocument.getPage(pageNumber - 1); // convert the 1-based page number to the 0-based index used by getPage
+
+        ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page); // its rulings are handed to the page builder below
+        streamEngine.processPage(page);
+
+        CustomTabulaTextStripper textStripper = new CustomTabulaTextStripper(pdfDocument, pageNumber); // the custom stripper named in the class Javadoc
+        textStripper.process();
+
+        Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
+
+        float width, height;
+        int rotation = page.getRotation();
+        if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) { // quarter-turn rotation: crop-box width and height are swapped
+            width = page.getCropBox().getHeight();
+            height = page.getCropBox().getWidth();
+        } else {
+            width = page.getCropBox().getWidth();
+            height = page.getCropBox().getHeight();
+        }
+
+        return Page.Builder.newInstance()
+                .withPageDims(PageDims.of(0, 0, width, height))
+                .withRotation(rotation)
+                .withNumber(pageNumber)
+                .withPdPage(page)
+                .withPdDocument(pdfDocument)
+                .withRulings(streamEngine.rulings)
+                .withTextElements(textStripper.getTextElements())
+                .withMinCharWidth(textStripper.getMinCharWidth())
+                .withMinCharHeight(textStripper.getMinCharHeight())
+                .withIndex(textStripper.getSpatialIndex())
+                .build();
+    }
+}

+ 0 - 12
service-deploy/pom.xml

@@ -58,18 +58,6 @@
                 </exclusion>
             </exclusions>
         </dependency>
-
-        <dependency>
-            <groupId>org.springframework.boot</groupId>
-            <artifactId>spring-boot-devtools</artifactId>
-            <scope>runtime</scope>
-            <optional>true</optional>
-        </dependency>
-        <dependency>
-            <groupId>org.springframework.boot</groupId>
-            <artifactId>spring-boot-configuration-processor</artifactId>
-            <optional>true</optional>
-        </dependency>
     </dependencies>
 
     <build>

+ 2 - 0
service-deploy/src/main/resources/application.yml

@@ -81,6 +81,8 @@ simuwang:
   # token过期时间,单位:分钟
   token-expire: 1440
   token-secret: qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm12
+  # 是否开启python的报告解析功能,开启后报告全部用python接口来解析;当开启时要配置如下python解析地址
+  enable-py-parser: false
   py-base-url: "http://192.168.1.224:8088"
   # rsa 公钥私钥配置
   security-rsa:

+ 21 - 9
service-deploy/src/test/java/com/simuwang/ApplicationTest.java

@@ -25,13 +25,7 @@ public class ApplicationTest {
 
     @Test
     public void test() {
-        MailboxInfoDTO emailInfoDTO = new MailboxInfoDTO();
-        emailInfoDTO.setUserId(2395446);
-        emailInfoDTO.setAccount("mozuwen@simuwang.com");
-        emailInfoDTO.setPassword("Mzw@0306");
-        emailInfoDTO.setHost("imap.exmail.qq.com");
-        emailInfoDTO.setPort("993");
-        emailInfoDTO.setProtocol("imap");
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox();
 //
 //        emailInfoDTO.setAccount("jjpj_test");
 //        emailInfoDTO.setPassword("shzq#919");
@@ -49,8 +43,15 @@ public class ApplicationTest {
     }
 
     @Test
-    public void pyTest() {
-
+    public void reportTest() {
+        MailboxInfoDTO emailInfoDTO = this.buildMailbox();
+        Date startDate = DateUtil.parse("2024-10-11 08:30:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-10-11 09:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        try {
+            emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 
     @Test
@@ -72,4 +73,15 @@ public class ApplicationTest {
             System.out.println(dateString + ": -> " + date);
         }
     }
+
+    private MailboxInfoDTO buildMailbox() { // builds the mailbox settings used by the test() and reportTest() cases
+        MailboxInfoDTO emailInfoDTO = new MailboxInfoDTO();
+        emailInfoDTO.setUserId(1);
+        emailInfoDTO.setAccount("*"); // placeholder value; NOTE(review): presumably a real account must be filled in locally before running - confirm
+        emailInfoDTO.setPassword("*"); // placeholder value; no real credential is kept in this method
+        emailInfoDTO.setHost("imap.exmail.qq.com");
+        emailInfoDTO.setPort("993");
+        emailInfoDTO.setProtocol("imap");
+        return emailInfoDTO;
+    }
 }

+ 0 - 21
service-manage/src/main/java/com/simuwang/manage/api/test/ReportParseTestApi.java

@@ -1,21 +0,0 @@
-package com.simuwang.manage.api.test;
-
-import com.simuwang.daq.service.ReportParseService;
-import org.springframework.web.bind.annotation.GetMapping;
-import org.springframework.web.bind.annotation.RequestMapping;
-import org.springframework.web.bind.annotation.RestController;
-
-@RestController
-@RequestMapping("/v1/test/parse")
-public class ReportParseTestApi {
-    private final ReportParseService service;
-
-    public ReportParseTestApi(ReportParseService service) {
-        this.service = service;
-    }
-
-    @GetMapping("monthly")
-    public void monthly() {
-        this.service.parse();
-    }
-}