Просмотр исходного кода

feat:报告解析转java之pdf抽象类提取

wangzaijun 7 месяцев назад
Родитель
Сommit
d8f6a907f3

+ 5 - 0
service-base/src/main/java/com/simuwang/base/common/conts/Constants.java

@@ -1,12 +1,17 @@
 package com.simuwang.base.common.conts;
 
 
+import cn.hutool.core.util.StrUtil;
+
 /**
  * 通用常量信息
  *
  * @author ruoyi
  */
 public class Constants {
+    public static final String EMPTY = StrUtil.EMPTY;
+    public static final String WATERMARK_REPLACE = "+_+";
+
     public static final long DEFAULT_SERIAL_ID = 999L;
 
     /**

+ 19 - 18
service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java

@@ -2,12 +2,13 @@ package com.simuwang.daq.components;
 
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
+import com.simuwang.base.common.conts.Constants;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.util.Matrix;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * @author wangzaijun
@@ -15,28 +16,28 @@ import java.util.List;
  * @description 自定义的文本去水印方法,发现水印基本是旋转文字并且比报告内其他文字都大
  */
 public class CustomPDFTextStripper extends PDFTextStripper {
-    private final float[] watermarkWidth = {0f};
-
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
+        // 水印文字基本都是有角度的,统计有旋转角度的文字宽度
+        List<Float> weights = textPositions.stream().filter(e -> e.getTextMatrix().getValue(0, 1) != 0.)
+                .map(TextPosition::getWidth).collect(Collectors.toList());
+        // 集合为空表示text的内容没有水印影响,直接输出该内容
+        if (CollUtil.isEmpty(weights)) {
+            super.writeString(text);
+            return;
+        }
+        // 如果全是水印文字则直接去除
+        if (textPositions.size() == weights.size()) {
+            super.writeString(Constants.WATERMARK_REPLACE);
+            return;
+        }
+        // 否则去除水印(文字没有旋转角度,并且水印字体大小没有包含当前文字时说明是正常文字;否则识别为水印并用特殊符号代替)
         List<String> newTexts = ListUtil.list(false);
         for (TextPosition textPosition : textPositions) {
-            Matrix textMatrix = textPosition.getTextMatrix();
-            float col = textMatrix.getValue(0, 1);
+            float col = textPosition.getTextMatrix().getValue(0, 1);
             float width = textPosition.getWidth();
-            if (col == 0.) {
-                if (width < watermarkWidth[0]) {
-                    newTexts.add(textPosition.getUnicode());
-                }
-            } else {
-                if (width > watermarkWidth[0]) {
-                    watermarkWidth[0] = width;
-                }
-                newTexts.add("+_+");
-            }
-        }
-        if (CollUtil.isNotEmpty(newTexts)) {
-            super.writeString(String.join("", newTexts));
+            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : Constants.WATERMARK_REPLACE);
         }
+        super.writeString(String.join(Constants.EMPTY, newTexts));
     }
 }

+ 36 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java

@@ -0,0 +1,36 @@
+package com.simuwang.daq.components.report.parser;
+
+import cn.hutool.core.collection.CollUtil;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.smppw.common.pojo.ValueLabelVO;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for report parsers. It owns the field-mapping rules that are loaded
+ * from {@link EmailFieldMappingMapper} each time {@link #init()} runs.
+ *
+ * @param <T> type of the parsed report data
+ */
+public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
+    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+    private final EmailFieldMappingMapper fieldMappingMapper;
+    /**
+     * Field matching rules as value/label pairs built from each mapping's code and name.
+     * {@code null} until {@link #init()} has run; afterwards never {@code null}
+     * (empty when no rules are configured).
+     */
+    protected List<ValueLabelVO> fieldMapper;
+
+    /**
+     * @param fieldMappingMapper mapper used by {@link #init()} to load the field-mapping rules
+     */
+    public AbstractReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        this.fieldMappingMapper = fieldMappingMapper;
+    }
+
+    /**
+     * Reloads the field-mapping rules into {@link #fieldMapper}.
+     * When no rules are configured an error is logged and the rules are reset to an
+     * empty list, so code that iterates them neither hits a {@code null} nor reuses
+     * rules left over from an earlier call.
+     */
+    @Override
+    public void init() {
+        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
+        if (CollUtil.isEmpty(emailFieldMapping)) {
+            this.logger.error("未设置报告解析规则!");
+            // Reset instead of leaving the previous (or null) value in place.
+            this.fieldMapper = new java.util.ArrayList<>();
+            return;
+        }
+        this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
+    }
+}

+ 7 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java

@@ -12,6 +12,13 @@ import java.io.IOException;
  */
 public interface ReportParser<T extends ReportData> {
     /**
+     * 初始化方法,该方法在执行前调用
+     */
+    default void init() {
+
+    }
+
+    /**
      * 获取当前解析器名称
      *
      * @return /

+ 212 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -0,0 +1,212 @@
+package com.simuwang.daq.components.report.parser.pdf;
+
+import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
+import com.simuwang.base.common.exception.APIException;
+import com.simuwang.base.mapper.EmailFieldMappingMapper;
+import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
+import com.simuwang.base.pojo.dto.report.ReportData;
+import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
+import com.simuwang.base.pojo.dto.report.ReportParserParams;
+import com.simuwang.daq.components.CustomPDFTextStripper;
+import com.simuwang.daq.components.report.parser.AbstractReportParser;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.PageIterator;
+import technology.tabula.Table;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+
+import java.io.IOException;
+import java.util.Calendar;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 16:45
+ * @description pdf格式的报告解析抽象类
+ */
+public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
+    /**
+     * Creates the parser and hands the mapper to the parent class.
+     *
+     * @param fieldMappingMapper mapper passed through to the superclass constructor
+     */
+    public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
+        super(fieldMappingMapper);
+    }
+
+    /**
+     * Parses a PDF report: runs {@code init()}, extracts the report name from the first
+     * non-blank text line, feeds the tables of every page to {@link #initTableInfo(List)},
+     * then delegates to {@link #parseFundInfo(ReportParserParams)} and
+     * {@link #parseExtInfoAndSetData(String, ReportFundInfoDTO, ReportParserParams)}.
+     *
+     * @param params parser parameters; {@code getFilepath()} supplies the PDF to load
+     * @return the result produced by {@code parseExtInfoAndSetData}
+     * @throws IOException  if loading the PDF or extracting its text fails
+     * @throws APIException if the matched report name is blank
+     */
+    @Override
+    public T parse(ReportParserParams params) throws IOException {
+        this.init();
+        // Stays null when the document yields no non-blank text line.
+        String reportName = null;
+        try (PDDocument pdf = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
+            // Text pass: watermark glyphs come back as a placeholder that is stripped here.
+            CustomPDFTextStripper textStripper = new CustomPDFTextStripper();
+            textStripper.setSortByPosition(true);
+            String rawText = textStripper.getText(pdf);
+            String cleanedText = rawText.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
+            List<String> lines = StrUtil.split(cleanedText, System.lineSeparator());
+            lines.removeIf(StrUtil::isBlank);
+            if (CollUtil.isNotEmpty(lines)) {
+                String firstLine = lines.get(0);
+                reportName = this.matchReportName(firstLine);
+                if (StrUtil.isBlank(reportName)) {
+                    throw new APIException("未匹配到报告名称");
+                }
+            }
+            // Table pass: one callback per page with that page's de-duplicated tables.
+            SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
+            for (PageIterator pages = new ObjectExtractor(pdf).extract(); pages.hasNext(); ) {
+                Page currentPage = pages.next();
+                List<Table> distinctTables = algorithm.extract(currentPage).stream()
+                        .distinct()
+                        .collect(Collectors.toList());
+                this.initTableInfo(distinctTables);
+            }
+        }
+        // Basic information of the report's main fund.
+        ReportFundInfoDTO fundInfo = this.parseFundInfo(params);
+        // Remaining table information plus assembly of the result object.
+        return this.parseExtInfoAndSetData(reportName, fundInfo, params);
+    }
+
+    /**
+     * Receives the tables extracted from one PDF page; {@code parse} calls this once per
+     * page, after removing duplicate tables, while the document is still open.
+     *
+     * @param tables de-duplicated tables of the current page
+     */
+    protected abstract void initTableInfo(List<Table> tables);
+
+    /**
+     * Parses the basic information of the report's main fund; {@code parse} calls this
+     * after all pages have been passed to {@code initTableInfo}.
+     *
+     * @param params parser parameters passed through from {@code parse}
+     * @return the parsed fund information
+     */
+    protected abstract ReportFundInfoDTO parseFundInfo(ReportParserParams params);
+
+    /**
+     * Parses the remaining table information and assembles the final result;
+     * its return value is what {@code parse} returns.
+     *
+     * @param reportName report name matched by {@code parse}; {@code null} when the PDF
+     *                   produced no non-blank text line
+     * @param fundInfo   value returned by {@code parseFundInfo}
+     * @param params     parser parameters passed through from {@code parse}
+     * @return the populated report data
+     */
+    protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
+
+    /**
+     * Builds the report's base information; the report type and report date are both
+     * derived from the report name.
+     *
+     * @param fileId     file id
+     * @param reportName report name
+     * @return a new {@link ReportBaseInfoDTO} carrying the id, name, type and date
+     */
+    protected ReportBaseInfoDTO buildReportInfo(Integer fileId, String reportName) {
+        // Derive both name-based attributes up front, then populate the DTO.
+        String reportType = this.matchReportType(reportName);
+        String reportDate = this.matchReportDate(reportName);
+
+        ReportBaseInfoDTO info = new ReportBaseInfoDTO();
+        info.setFileId(fileId);
+        info.setReportName(reportName);
+        info.setReportType(reportType);
+        info.setReportDate(reportDate);
+        return info;
+    }
+
+    /**
+     * Extracts the report name from a line of text by trying three patterns in order
+     * and taking the first that matches; when none matches, the whole text is used.
+     *
+     * @param text text content (in {@code parse}: the first non-blank line of the PDF)
+     * @return the matched report name, or {@code null} when {@code text} is blank
+     */
+    private String matchReportName(String text) {
+        if (StrUtil.isBlank(text)) {
+            return null;
+        }
+        // Compile the regular expression patterns
+        // NOTE(review): whatever pat3 matches is also matched by pat1 (pat1's mandatory
+        // part is a prefix of pat3) and pat1 is tried first, so the pat3 branch below
+        // looks unreachable - confirm the intended priority.
+        Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
+        Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
+        Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
+
+        // Create the Matcher objects
+        Matcher matcher1 = pat1.matcher(text);
+        Matcher matcher2 = pat2.matcher(text);
+        Matcher matcher3 = pat3.matcher(text);
+
+        // Try the patterns in priority order; fall back to the raw text
+        String reportName;
+        if (matcher1.find()) {
+            reportName = matcher1.group();
+        } else if (matcher2.find()) {
+            reportName = matcher2.group();
+        } else if (matcher3.find()) {
+            reportName = matcher3.group();
+        } else {
+            reportName = text;
+        }
+        // NOTE(review): as rendered here each replace() maps a parenthesis to itself;
+        // presumably the first arguments were full-width parentheses - confirm against the repository.
+        return reportName.replace("(", "(").replace(")", ")");
+    }
+
+    /**
+     * Derives the report date (formatted {@code yyyy-MM-dd}) from a report name.
+     * The forms are tried in this order: quarter, explicit date, annual, year plus month.
+     *
+     * @param string text content
+     * @return the report date, or {@code null} when the input is {@code null} or no form matches
+     */
+    private String matchReportDate(String string) {
+        if (string == null) {
+            return null;
+        }
+
+        // Quarter form, e.g. "2023年3季度" -> last day of that quarter
+        Matcher quarterMatcher = Pattern.compile("(2\\d{3}).*([一二三四1234])季度").matcher(string);
+        if (quarterMatcher.find()) {
+            String year = quarterMatcher.group(1);
+            switch (quarterMatcher.group(2)) {
+                case "一":
+                case "1":
+                    return year + "-03-31";
+                case "二":
+                case "2":
+                    return year + "-06-30";
+                case "三":
+                case "3":
+                    return year + "-09-30";
+                case "四":
+                case "4":
+                    return year + "-12-31";
+                default:
+                    return null;
+            }
+        }
+
+        // Explicit date, e.g. "2023-12-31" -> returned as found
+        Matcher dateMatcher = Pattern.compile("\\d{4}-\\d{2}-\\d{2}").matcher(string);
+        if (dateMatcher.find()) {
+            return dateMatcher.group();
+        }
+
+        // Annual form, e.g. "2023年年度" -> December 31st of that year
+        Matcher annualMatcher = Pattern.compile("(2\\d{3})年年度").matcher(string);
+        if (annualMatcher.find()) {
+            return annualMatcher.group(1) + "-12-31";
+        }
+
+        // Year plus month, e.g. "2023年12月" -> last day of that month
+        Matcher monthMatcher = Pattern.compile("(\\d{4})年(\\d{1,2})月").matcher(string);
+        if (!monthMatcher.find()) {
+            return null;
+        }
+        String year = monthMatcher.group(1);
+        String month = monthMatcher.group(2);
+        int lastDay = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
+        return year + "-" + padZero(month) + "-" + padZero(lastDay + "");
+    }
+
+    /**
+     * Finds the report type keyword in a report name: the first occurrence of
+     * "月", "季度" or "年度".
+     *
+     * @param string input string
+     * @return the matched keyword, or {@code null} when the input is {@code null} or
+     *         contains none of the keywords
+     */
+    private String matchReportType(String string) {
+        if (string == null) {
+            return null;
+        }
+        Matcher typeMatcher = Pattern.compile("月|季度|年度").matcher(string);
+        return typeMatcher.find() ? typeMatcher.group() : null;
+    }
+
+    /**
+     * Returns the number of days in the given month.
+     *
+     * @param year  calendar year
+     * @param month month of the year, 1-based (1 = January)
+     * @return the last day-of-month of that month
+     */
+    private int getLastDayOfMonth(int year, int month) {
+        Calendar calendar = Calendar.getInstance();
+        // Drop the current date/time first and pin the day to the 1st. Otherwise today's
+        // day-of-month is kept, and on the 29th-31st a shorter target month rolls over
+        // into the following month, so the wrong month's length is reported.
+        calendar.clear();
+        calendar.set(year, month - 1, 1); // Calendar.MONTH is zero-based
+        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
+    }
+
+    /**
+     * Formats a numeric string as an integer left-padded with zeros to at least two digits.
+     *
+     * @param number decimal integer text
+     * @return the zero-padded representation
+     */
+    private String padZero(String number) {
+        int value = Integer.parseInt(number);
+        return String.format("%02d", value);
+    }
+}

+ 0 - 275
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractReportParser.java

@@ -1,275 +0,0 @@
-package com.simuwang.daq.components.report.parser.pdf;
-
-import cn.hutool.core.collection.CollUtil;
-import com.simuwang.base.mapper.EmailFieldMappingMapper;
-import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
-import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
-import com.simuwang.base.pojo.dto.report.ReportData;
-import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
-import com.simuwang.base.pojo.dto.report.ReportParserParams;
-import com.simuwang.daq.components.report.parser.ReportParser;
-import com.smppw.common.pojo.ValueLabelVO;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.util.Calendar;
-import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
-    protected final Logger logger = LoggerFactory.getLogger(this.getClass());
-
-    private final EmailFieldMappingMapper fieldMappingMapper;
-    /**
-     * 字段匹配规则
-     */
-    protected List<ValueLabelVO> fieldMapper;
-
-    public AbstractReportParser(EmailFieldMappingMapper fieldMappingMapper) {
-        this.fieldMappingMapper = fieldMappingMapper;
-    }
-
-    @Override
-    public T parse(ReportParserParams params) throws IOException {
-        List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
-        if (CollUtil.isEmpty(emailFieldMapping)) {
-            this.logger.error("未设置报告解析规则!");
-            return null;
-        }
-        this.fieldMapper = emailFieldMapping.stream().map(e -> new ValueLabelVO(e.getCode(), e.getName())).collect(Collectors.toList());
-        String reportName = this.initAndGetReportName(params);
-        ReportBaseInfoDTO reportInfo = this.buildReportInfo(params, reportName);
-        ReportFundInfoDTO reportFundInfo = this.parseBaseInfo(params);
-        return this.parseExtInfoAndSetData(reportInfo, reportFundInfo, params);
-    }
-
-    protected abstract String initAndGetReportName(ReportParserParams params) throws IOException;
-
-    private ReportBaseInfoDTO buildReportInfo(ReportParserParams params, String reportName) {
-        ReportBaseInfoDTO reportInfo = new ReportBaseInfoDTO();
-        reportInfo.setFileId(params.getFileId());
-        reportInfo.setReportName(reportName);
-        reportInfo.setReportType(this.matchReportType(reportName));
-        reportInfo.setReportDate(this.matchReportDate(reportName));
-        return reportInfo;
-    }
-
-    protected abstract ReportFundInfoDTO parseBaseInfo(ReportParserParams params);
-
-    protected abstract T parseExtInfoAndSetData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo, ReportParserParams params);
-
-//    protected abstract List<EXT> parseExtInfo();
-
-//    protected abstract void saveResult(ReportInfo reportInfo, ReportFundInfo reportFundInfo, List<EXT> exts);
-
-//    private Map<String, List<String>> generateWatermarkMap(String watermarkName) {
-//        Map<String, List<String>> result = MapUtil.newHashMap(32);
-//        // 生成水印列表
-//        String text = watermarkName;
-//        text = text.replaceAll("[()]", ""); // 移除括号
-//        List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
-//        Collections.reverse(textList);
-//        StringBuilder sb = new StringBuilder(textList.size());
-//        for (String ch : textList) {
-//            sb.append(ch);
-//        }
-//        String joinedText = sb.toString();
-//
-//        // 基本水印列表
-//        List<String> wkList = new ArrayList<>();
-//        for (String ch : textList) {
-//            wkList.add(ch + "\r\n");
-//            wkList.add("\r\n" + ch);
-//        }
-//
-//        // 查找数字
-//        List<String> matches = findDigits(watermarkName);
-//        if (!matches.isEmpty()) {
-//            for (String match : matches) {
-//                wkList.add("\r\n" + match);
-//                wkList.add(match + "\r\n");
-//            }
-//        }
-//        wkList.add("-");
-//        wkList.add("【");
-//        wkList.add("】");
-//        wkList.add("\r");
-//        wkList.add("\n");
-//        wkList.add("\r\n");
-//
-//        String noNumberText = removeDigits(joinedText);
-//
-//        // 生成不同字段的水印列表
-//        result.put("report_name", new ArrayList<>(wkList));
-//        result.get("report_name").addAll(convertStringToList("有限公司"));
-//
-//        result.put("less", new ArrayList<>(wkList));
-//
-//        result.put("more", new ArrayList<>(wkList));
-//        result.get("more").addAll(convertStringToList(noNumberText));
-//
-//        result.put("leverage", new ArrayList<>(wkList));
-//        result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
-//
-//        result.put("base_info", new ArrayList<>(wkList));
-//        result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
-//
-//        result.put("industry", new ArrayList<>(wkList));
-//        result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
-//
-//        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
-//        return result;
-//    }
-
-//    private List<String> findDigits(String text) {
-//        List<String> digits = new ArrayList<>();
-//        Pattern pattern = Pattern.compile("\\d");
-//        Matcher matcher = pattern.matcher(text);
-//        while (matcher.find()) {
-//            digits.add(matcher.group());
-//        }
-//        return digits;
-//    }
-//
-//    private String removeDigits(String text) {
-//        return text.replaceAll("\\d", "");
-//    }
-//
-//    private String removeKeywords(String text, String... keywords) {
-//        for (String keyword : keywords) {
-//            text = text.replaceAll(keyword, "");
-//        }
-//        return text;
-//    }
-//
-//    private List<String> convertStringToList(String text) {
-//        List<String> charList = new ArrayList<>();
-//        for (char c : text.toCharArray()) {
-//            charList.add(c + "");
-//        }
-//        return charList;
-//    }
-
-//    protected String processString(List<String> wmList, String string) {
-//        if (StrUtil.isBlank(string)) {
-//            return null;
-//        }
-//        // 生成正则表达式模式
-//        String pat = String.join("|", wmList);
-//        // 使用正则表达式移除wmList中的元素
-//        string = removeMatches(string, pat);
-//        // 替换中文括号为英文括号
-//        string = string.replace("(", "(").replace(")", ")");
-//        // 移除空格
-//        string = string.replace(" ", "");
-//        // 如果字符串以括号开头,则移除第一个字符
-//        if (startsWithParenthesis(string)) {
-//            string = string.substring(1);
-//        }
-//
-//        return string;
-//    }
-
-//    private String removeMatches(String input, String pattern) {
-//        // 编译正则表达式
-//        Pattern compiledPattern = Pattern.compile(pattern);
-//        // 创建Matcher对象
-//        Matcher matcher = compiledPattern.matcher(input);
-//        // 使用replaceAll方法替换所有匹配到的字符为空字符串
-//        return matcher.replaceAll("");
-//    }
-//
-//    private boolean startsWithParenthesis(String input) {
-//        // 匹配以括号开头的字符串
-//        Pattern pattern = Pattern.compile("^[()].*");
-//        Matcher matcher = pattern.matcher(input);
-//        return matcher.find();
-//    }
-
-    /**
-     * 匹配报告日期
-     *
-     * @param string 文本内容
-     * @return 报告日期
-     */
-    private String matchReportDate(String string) {
-        if (string == null) {
-            return null;
-        }
-
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
-        Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
-        Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
-        Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
-
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(string);
-        Matcher matcher2 = pat2.matcher(string);
-        Matcher matcher3 = pat3.matcher(string);
-        Matcher matcher4 = pat4.matcher(string);
-
-        // 尝试匹配
-        if (matcher1.find()) {
-            String year = matcher1.group(1);
-            String quarter = matcher1.group(2);
-            return switch (quarter) {
-                case "一", "1" -> year + "-03-31";
-                case "二", "2" -> year + "-06-30";
-                case "三", "3" -> year + "-09-30";
-                case "四", "4" -> year + "-12-31";
-                default -> null;
-            };
-        } else if (matcher2.find()) {
-            return matcher2.group();
-        } else if (matcher3.find()) {
-            return matcher3.group(1) + "-12-31";
-        } else if (matcher4.find()) {
-            String year = matcher4.group(1);
-            String month = matcher4.group(2);
-            int lastDayOfMonth = getLastDayOfMonth(Integer.parseInt(year), Integer.parseInt(month));
-            return year + "-" + padZero(month) + "-" + padZero(lastDayOfMonth + "");
-        } else {
-            return null;
-        }
-    }
-
-    /**
-     * 匹配报告类型,如“季度”、“年度”
-     *
-     * @param string 输入字符串
-     * @return 匹配到的报告类型子字符串,如果没有匹配到则返回null
-     */
-    private String matchReportType(String string) {
-        if (string == null) {
-            return null;
-        }
-
-        // 编译正则表达式模式
-        Pattern pattern = Pattern.compile("月|季度|年度");
-
-        // 创建Matcher对象
-        Matcher matcher = pattern.matcher(string);
-
-        // 尝试匹配
-        if (matcher.find()) {
-            return matcher.group();
-        } else {
-            return null;
-        }
-    }
-
-    private int getLastDayOfMonth(int year, int month) {
-        Calendar calendar = Calendar.getInstance();
-        calendar.set(Calendar.YEAR, year);
-        calendar.set(Calendar.MONTH, month - 1); // Calendar.MONTH 是从0开始的
-        return calendar.getActualMaximum(Calendar.DAY_OF_MONTH);
-    }
-
-    private String padZero(String number) {
-        return String.format("%02d", Integer.parseInt(number));
-    }
-}

+ 10 - 10
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -2,37 +2,37 @@ package com.simuwang.daq.components.report.parser.pdf;
 
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.AnnuallyReportData;
-import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
 import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
 import com.simuwang.base.pojo.dto.report.ReportParserParams;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
+import technology.tabula.Table;
 
-import java.io.IOException;
+import java.util.List;
 
 @Component(ReportParserConstant.PARSER_PDF_ANNUALLY)
-public class PDAnnuallyReportParser extends AbstractReportParser<AnnuallyReportData> {
+public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyReportData> {
     public PDAnnuallyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
         super(fieldMappingMapper);
     }
 
     @Override
-    protected String initAndGetReportName(ReportParserParams params) throws IOException {
-        return null;
+    public String getParser() {
+        return ReportParserConstant.PARSER_PDF_ANNUALLY;
     }
 
     @Override
-    protected ReportFundInfoDTO parseBaseInfo(ReportParserParams params) {
-        return null;
+    protected void initTableInfo(List<Table> tables) {
+
     }
 
     @Override
-    protected AnnuallyReportData parseExtInfoAndSetData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo, ReportParserParams params) {
+    protected ReportFundInfoDTO parseFundInfo(ReportParserParams params) {
         return null;
     }
 
     @Override
-    public String getParser() {
-        return ReportParserConstant.PARSER_PDF_ANNUALLY;
+    protected AnnuallyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
+        return null;
     }
 }

+ 20 - 80
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java

@@ -1,29 +1,21 @@
 package com.simuwang.daq.components.report.parser.pdf;
 
-import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.ReflectUtil;
 import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.common.exception.APIException;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.*;
-import com.simuwang.daq.components.CustomPDFTextStripper;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import com.smppw.common.pojo.ValueLabelVO;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
-import org.apache.pdfbox.pdmodel.PDDocument;
 import org.springframework.stereotype.Component;
-import technology.tabula.*;
-import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+import technology.tabula.RectangularTextContainer;
+import technology.tabula.Table;
 
-import java.io.IOException;
 import java.util.List;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
 
 /**
  * @author wangzaijun
@@ -31,9 +23,9 @@ import java.util.stream.Collectors;
  * @description pdf格式的月报解析
  */
 @Component(ReportParserConstant.PARSER_PDF_MONTHLY)
-public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportData> {
+public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportData> {
     private final List<Table> extNavTables = ListUtil.list(true);
-    private Table baseInfoTable = null;
+    private Table fundInfoTable = null;
 
     public PDMonthlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
         super(fieldMappingMapper);
@@ -45,50 +37,26 @@ public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportDat
     }
 
     @Override
-    protected String initAndGetReportName(ReportParserParams params) throws IOException {
-        String reportName = null;
-        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
-            stripper.setSortByPosition(true);
-            String text = stripper.getText(document);
-            text = text.replace("+_+\r\n", "").replace("+_+", "");
-            List<String> textList = StrUtil.split(text, "\r\n");
-            if (CollUtil.isNotEmpty(textList)) {
-                String name = textList.get(0);
-                reportName = this.matchReportName(name);
-                if (StrUtil.isBlank(reportName)) {
-                    throw new APIException("未匹配到报告名称");
-                }
-            }
-
-            SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
-            PageIterator pageIterator = new ObjectExtractor(document).extract();
-            while (pageIterator.hasNext()) {
-                Page page = pageIterator.next();
-                List<Table> tables = extractionAlgorithm.extract(page);
-                tables = tables.stream().distinct().collect(Collectors.toList());
-                for (Table table : tables) {
-                    int colCount = table.getColCount();
-                    if (colCount == 4) {
-                        this.baseInfoTable = table;
-                    } else if (colCount >= 5) {
-                        this.extNavTables.add(table);
-                    }
-                }
+    protected void initTableInfo(List<Table> tables) {
+        for (Table table : tables) {
+            int colCount = table.getColCount();
+            if (colCount == 4) {
+                this.fundInfoTable = table;
+            } else if (colCount >= 5) {
+                this.extNavTables.add(table);
             }
         }
-        return reportName;
     }
 
     @Override
-    protected ReportFundInfoDTO parseBaseInfo(ReportParserParams params) {
-        Table baseInfoTable = this.baseInfoTable;
-        if (baseInfoTable == null) {
+    protected ReportFundInfoDTO parseFundInfo(ReportParserParams params) {
+        Table fundInfoTable = this.fundInfoTable;
+        if (fundInfoTable == null) {
             throw new APIException("未解析到基本信息表格");
         }
         Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
-        for (int i = 0; i < baseInfoTable.getRows().size(); i++) {
-            List<RectangularTextContainer> cols = baseInfoTable.getRows().get(i);
+        for (int i = 0; i < fundInfoTable.getRows().size(); i++) {
+            List<RectangularTextContainer> cols = fundInfoTable.getRows().get(i);
             for (int j = 0; j < 2; j++) {
                 baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
             }
@@ -100,9 +68,9 @@ public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportDat
     }
 
     @Override
-    protected MonthlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo, ReportParserParams params) {
+    protected MonthlyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
         MonthlyReportData reportData = new MonthlyReportData();
-        reportData.setBaseInfo(baseInfo);
+        reportData.setBaseInfo(this.buildReportInfo(params.getFileId(), reportName));
         reportData.setFundInfo(fundInfo);
 
         List<ReportNetReportDTO> exts = ListUtil.list(false);
@@ -131,7 +99,7 @@ public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportDat
                 fieldValue = null;
             }
             if (fieldValue != null) {
-                fieldValue = fieldValue.replace("\r", "");
+                fieldValue = fieldValue.replace("\r", Constants.EMPTY);
             }
             for (ValueLabelVO vo : this.fieldMapper) {
                 String fieldName = vo.getValue();
@@ -157,32 +125,4 @@ public class PDMonthlyReportParser extends AbstractReportParser<MonthlyReportDat
             }
         }
     }
-
-    private String matchReportName(String text) {
-        if (StrUtil.isBlank(text)) {
-            return null;
-        }
-        // 编译正则表达式模式
-        Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
-        Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
-        Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
-
-        // 创建Matcher对象
-        Matcher matcher1 = pat1.matcher(text);
-        Matcher matcher2 = pat2.matcher(text);
-        Matcher matcher3 = pat3.matcher(text);
-
-        // 尝试匹配
-        String reportName;
-        if (matcher1.find()) {
-            reportName = matcher1.group();
-        } else if (matcher2.find()) {
-            reportName = matcher2.group();
-        } else if (matcher3.find()) {
-            reportName = matcher3.group();
-        } else {
-            reportName = text;
-        }
-        return reportName.replace("(", "(").replace(")", ")");
-    }
 }

+ 7 - 7
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -2,16 +2,16 @@ package com.simuwang.daq.components.report.parser.pdf;
 
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.QuarterlyReportData;
-import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
 import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
 import com.simuwang.base.pojo.dto.report.ReportParserParams;
 import com.simuwang.daq.components.report.parser.ReportParserConstant;
 import org.springframework.stereotype.Component;
+import technology.tabula.Table;
 
-import java.io.IOException;
+import java.util.List;
 
 @Component(ReportParserConstant.PARSER_PDF_QUARTERLY)
-public class PDQuarterlyReportParser extends AbstractReportParser<QuarterlyReportData> {
+public class PDQuarterlyReportParser extends AbstractPDReportParser<QuarterlyReportData> {
     public PDQuarterlyReportParser(EmailFieldMappingMapper fieldMappingMapper) {
         super(fieldMappingMapper);
     }
@@ -22,17 +22,17 @@ public class PDQuarterlyReportParser extends AbstractReportParser<QuarterlyRepor
     }
 
     @Override
-    protected String initAndGetReportName(ReportParserParams params) throws IOException {
-        return null;
+    protected void initTableInfo(List<Table> tables) {
+
     }
 
     @Override
-    protected ReportFundInfoDTO parseBaseInfo(ReportParserParams params) {
+    protected ReportFundInfoDTO parseFundInfo(ReportParserParams params) {
         return null;
     }
 
     @Override
-    protected QuarterlyReportData parseExtInfoAndSetData(ReportBaseInfoDTO baseInfo, ReportFundInfoDTO fundInfo, ReportParserParams params) {
+    protected QuarterlyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
         return null;
     }
 }

+ 6 - 0
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java

@@ -19,6 +19,11 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.util.Map;
 
+/**
+ * @author wangzaijun
+ * @date 2024/9/29 16:46
+ * @description python解析报告的抽象类
+ */
 public abstract class AbstractPyReportParser<T extends ReportData> implements ReportParser<T> {
     protected final Logger logger = LoggerFactory.getLogger(this.getClass());
 
@@ -32,6 +37,7 @@ public abstract class AbstractPyReportParser<T extends ReportData> implements Re
 
     @Override
     public T parse(ReportParserParams params) throws IOException {
+        this.init();
         Boolean enablePyParser = this.properties.getEnablePyParser();
         if (!enablePyParser) {
             this.logger.error("The python report parser is unavailable!");

+ 2 - 1
service-daq/src/main/java/com/simuwang/daq/service/ReportEmailParser.java

@@ -4,6 +4,7 @@ import cn.hutool.core.collection.ListUtil;
 import com.simuwang.base.common.conts.EmailTypeConst;
 import com.simuwang.base.pojo.dto.EmailContentInfoDTO;
 import com.simuwang.base.pojo.dto.EmailFundNavDTO;
+import com.simuwang.daq.components.report.parser.pdf.AbstractPDReportParser;
 import org.springframework.stereotype.Component;
 
 import java.util.List;
@@ -13,7 +14,7 @@ import java.util.Map;
  * @author wangzaijun
  * @date 2024/9/25 14:52
  * @description 报告的解析逻辑
- * @see com.simuwang.daq.components.report.parser.ReportParser,com.simuwang.daq.components.report.parser.py.AbstractPyReportParser,com.simuwang.daq.components.report.parser.pdf.AbstractReportParser
+ * @see com.simuwang.daq.components.report.parser.ReportParser,com.simuwang.daq.components.report.parser.py.AbstractPyReportParser, AbstractPDReportParser
  */
 @Component
 public class ReportEmailParser extends AbstractEmailParser {

+ 9 - 7
service-daq/src/main/java/com/simuwang/daq/utils/ReportParseUtil.java

@@ -7,10 +7,11 @@
 //import cn.hutool.http.HttpUtil;
 //import cn.hutool.json.JSONObject;
 //import cn.hutool.json.JSONUtil;
+//import com.simuwang.base.common.conts.Constants;
 //import com.simuwang.base.pojo.dto.report.PythonResult;
+//import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
 //import com.simuwang.daq.components.CustomPDFTextStripper;
 //import com.simuwang.daq.components.PythonReportConverter;
-//import com.simuwang.daq.dto.ReportFundInfo;
 //import com.smppw.common.pojo.ValueLabelVO;
 //import org.apache.pdfbox.Loader;
 //import org.apache.pdfbox.cos.COSName;
@@ -68,12 +69,12 @@
 //        fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
 //        fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
 //
-//        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
-//        List<String> watermarks = watermarkMap.get("less");
+////        Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
+////        List<String> watermarks = watermarkMap.get("less");
 //
 ////        System.out.println(watermarks);
 ////        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
-//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\2061834.pdf"))) {
+//        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("C:\\Users\\Administrator\\Desktop\\self\\新报告解析\\基协报告\\季报\\SVP311_私募基金季报PDF_国恩回报6号增强私募证券投资基金_2024年06月30日.pdf"))) {
 ////            PDFTextStripper stripper = new PDFTextStripper();
 ////            stripper.setSortByPosition(true);
 ////            String allText = stripper.getText(document);
@@ -83,8 +84,9 @@
 //            PDFTextStripper textStripper = new CustomPDFTextStripper();
 //            textStripper.setSortByPosition(true);
 //            String text1 = textStripper.getText(document);
-//            text1 = text1.replace("+\r\n", "").replace("+","");
-//            List<String> textList = StrUtil.split(text1, "\r\n");
+//            text1 = text1.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
+//            List<String> textList = StrUtil.split(text1, System.lineSeparator());
+//            textList.removeIf(StrUtil::isBlank);
 //            System.out.println(textList.get(0));
 //
 ////            for (PDPage page : document.getPages()) {
@@ -127,7 +129,7 @@
 //                                baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
 //                            }
 //                        }
-//                        ReportFundInfo reportFundInfo = new ReportFundInfo();
+//                        ReportFundInfoDTO reportFundInfo = new ReportFundInfoDTO();
 //                        baseInfoMap.forEach((k, v) -> {
 //                            for (ValueLabelVO vo : fieldMapper) {
 //                                String fieldName = vo.getValue();