7 months ago · f74a336f74
--- a/service-base/src/main/java/com/simuwang/base/common/conts/Constants.java
+++ b/service-base/src/main/java/com/simuwang/base/common/conts/Constants.java
@@ -7,6 +7,8 @@ package com.simuwang.base.common.conts;
 
																  * @author ruoyi
															
 
																  */
															
 
																 public class Constants {
															
 
																+    public static final String WATERMARK_REPLACE = "+_+" + System.lineSeparator();
															
 
																+
															
 
																     public static final long DEFAULT_SERIAL_ID = 999L;
															
 
																     /**
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
@@ -3,24 +3,22 @@ package com.simuwang.daq.components;
 
																 import cn.hutool.core.collection.CollUtil;
															
 
																 import cn.hutool.core.collection.ListUtil;
															
 
																 import cn.hutool.core.util.StrUtil;
															
 
																-import org.apache.pdfbox.pdmodel.PDDocument;
															
 
																+import org.apache.pdfbox.text.PDFTextStripper;
															
 
																 import org.apache.pdfbox.text.TextPosition;
															
 
																-import technology.tabula.TextStripper;
															
 
																 import java.io.IOException;
															
 
																 import java.util.List;
															
 
																 import java.util.stream.Collectors;
															
 
																+import static com.simuwang.base.common.conts.Constants.WATERMARK_REPLACE;
															
 
																+
															
 
																 /**
															
 
																  * @author wangzaijun
															
 
																  * @date 2024/9/12 14:00
															
 
																- * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大
															
 
																+ * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大，区别于表格文字去水印的实现
															
 
																+ * @see CustomTabulaTextStripper
															
 
																  */
															
 
																-public class CustomPDFTextStripper extends TextStripper {
															
 
																-    public CustomPDFTextStripper(PDDocument document, int pageNumber) throws IOException {
															
 
																-        super(document, pageNumber);
															
 
																-    }
															
 
																-
															
 
																+public class CustomPDFTextStripper extends PDFTextStripper {
															
 
																     @Override
															
 
																     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
															
 
																         // 水印文字基本都是有角度的，统计有旋转角度的文字宽度
															
@@ -33,7 +31,7 @@ public class CustomPDFTextStripper extends TextStripper {
 
																         }
															
 
																         // 如果全是水印文字则直接去除
															
 
																         if (textPositions.size() == weights.size()) {
															
 
																-            super.writeString(System.lineSeparator());
															
 
																+            super.writeString(WATERMARK_REPLACE);
															
 
																             return;
															
 
																         }
															
 
																         // 否则去除水印（文字没有旋转角度，并且水印字体大小没有包含当前文字时说明是正常文字；否则识别为水印并用特殊符号代替）
															
@@ -41,7 +39,7 @@ public class CustomPDFTextStripper extends TextStripper {
 
																         for (TextPosition textPosition : textPositions) {
															
 
																             float col = textPosition.getTextMatrix().getValue(0, 1);
															
 
																             float width = textPosition.getWidth();
															
 
																-            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : System.lineSeparator());
															
 
																+            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : WATERMARK_REPLACE);
															
 
																         }
															
 
																         super.writeString(String.join(StrUtil.EMPTY, newTexts));
															
 
																     }
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java
@@ -0,0 +1,190 @@
 
																+package com.simuwang.daq.components;
															
 
																+
															
 
																+import org.apache.fontbox.util.BoundingBox;
															
 
																+import org.apache.pdfbox.pdmodel.PDDocument;
															
 
																+import org.apache.pdfbox.pdmodel.font.PDFont;
															
 
																+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
															
 
																+import org.apache.pdfbox.pdmodel.font.PDType3Font;
															
 
																+import org.apache.pdfbox.text.TextPosition;
															
 
																+import technology.tabula.RectangleSpatialIndex;
															
 
																+import technology.tabula.TextElement;
															
 
																+import technology.tabula.TextStripper;
															
 
																+import technology.tabula.Utils;
															
 
																+
															
 
																+import java.io.IOException;
															
 
																+import java.util.ArrayList;
															
 
																+import java.util.List;
															
 
																+import java.util.stream.Collectors;
															
 
																+
															
 
																+/**
															
 
																+ * @author wangzaijun
															
 
																+ * @date 2024/9/12 14:00
															
 
																+ * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大；主要依据文本旋转角度和字体大小判断是否为水印
															
 
																+ */
															
 
																+public class CustomTabulaTextStripper extends TextStripper {
															
 
																+    private static final String NBSP = "\u00A0";
															
 
																+    private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f;
															
 
																+    private static final float MAX_BLANK_FONT_SIZE = 40.0f;
															
 
																+    private static final float MIN_BLANK_FONT_SIZE = 2.0f;
															
 
																+    private final PDDocument document;
															
 
																+    private final ArrayList<TextElement> textElements;
															
 
																+    private final RectangleSpatialIndex<TextElement> spatialIndex;
															
 
																+    private float minCharWidth = Float.MAX_VALUE;
															
 
																+    private float minCharHeight = Float.MAX_VALUE;
															
 
																+    private float totalHeight = 0.0f;
															
 
																+    private int countHeight = 0;
															
 
																+
															
 
																+    public CustomTabulaTextStripper(PDDocument document, int pageNumber) throws IOException {
															
 
																+        super(document, pageNumber);
															
 
																+        this.document = document;
															
 
																+        this.setStartPage(pageNumber);
															
 
																+        this.setEndPage(pageNumber);
															
 
																+        this.textElements = new ArrayList<>();
															
 
																+        this.spatialIndex = new RectangleSpatialIndex<>();
															
 
																+    }
															
 
																+
															
 
																+    public void process() throws IOException {
															
 
																+        this.getText(this.document);
															
 
																+    }
															
 
																+
															
 
																+    @Override
															
 
																+    protected void writeString(String string, List<TextPosition> textPositions) {
															
 
																+        // 有旋转角度的文字
															
 
																+        List<TextPosition> rotationTexts = textPositions.stream()
															
 
																+                .filter(e -> e.getTextMatrix().getValue(0, 1) != 0.).collect(Collectors.toList());
															
 
																+        // 水印文字基本都是有角度的，统计有旋转角度的文字高度
															
 
																+        List<Float> heights = rotationTexts.stream().map(TextPosition::getHeight).collect(Collectors.toList());
															
 
																+        // 如果全是水印文字则直接去除
															
 
																+        if (textPositions.size() == heights.size()) {
															
 
																+            return;
															
 
																+        }
															
 
																+
															
 
																+        // 其他场景需要写TextElement属性
															
 
																+        for (TextPosition textPosition : textPositions) {
															
 
																+            if (textPosition == null) {
															
 
																+                continue;
															
 
																+            }
															
 
																+
															
 
																+            String c = textPosition.getUnicode();
															
 
																+
															
 
																+            // if c not printable, return
															
 
																+            if (!isPrintable(c)) {
															
 
																+                continue;
															
 
																+            }
															
 
																+
															
 
																+            float h = textPosition.getHeightDir();
															
 
																+
															
 
																+            if (c.equals(NBSP)) { // replace non-breaking space for space
															
 
																+                c = " ";
															
 
																+            }
															
 
																+
															
 
																+            // 文字没有旋转角度，并且水印字体大小没有包含当前文字时说明是正常文字
															
 
																+            float rotation = textPosition.getTextMatrix().getValue(0, 1);
															
 
																+            if (rotation != 0. || heights.contains(h)) {
															
 
																+                c = " ";
															
 
																+            }
															
 
																+
															
 
																+            float wos = textPosition.getWidthOfSpace();
															
 
																+
															
 
																+            TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
															
 
																+                    Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
															
 
																+                    Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSizeInPt(), c,
															
 
																+                    // workaround a possible bug in PDFBox:
															
 
																+                    // https://issues.apache.org/jira/browse/PDFBOX-1755
															
 
																+                    wos, textPosition.getDir());
															
 
																+
															
 
																+            this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
															
 
																+            this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
															
 
																+
															
 
																+            countHeight++;
															
 
																+            totalHeight += te.getHeight();
															
 
																+            float avgHeight = totalHeight / countHeight;
															
 
																+
															
 
																+            //We have an issue where tall blank cells throw off the row height calculation
															
 
																+            //Introspect a blank cell a bit here to see if it should be thrown away
															
 
																+            if ((te.getText() == null || te.getText().trim().equals(""))) {
															
 
																+                //if the cell height is more than AVG_HEIGHT_MULT_THRESHOLDxaverage, throw it away
															
 
																+                if (avgHeight > 0
															
 
																+                        && te.getHeight() >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD)) {
															
 
																+                    continue;
															
 
																+                }
															
 
																+
															
 
																+                //if the font size is outside of reasonable ranges, throw it away
															
 
																+                if (textPosition.getFontSizeInPt() > MAX_BLANK_FONT_SIZE || textPosition.getFontSizeInPt() < MIN_BLANK_FONT_SIZE) {
															
 
																+                    continue;
															
 
																+                }
															
 
																+            }
															
 
																+
															
 
																+            this.spatialIndex.add(te);
															
 
																+            this.textElements.add(te);
															
 
																+        }
															
 
																+    }
															
 
																+
															
 
																+    @Override
															
 
																+    protected float computeFontHeight(PDFont font) throws IOException {
															
 
																+        BoundingBox bbox = font.getBoundingBox();
															
 
																+        if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
															
 
																+            // PDFBOX-2158 and PDFBOX-3130
															
 
																+            // files by Salmat eSolutions / ClibPDF Library
															
 
																+            bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
															
 
																+        }
															
 
																+        // 1/2 the bbox is used as the height todo: why?
															
 
																+        float glyphHeight = bbox.getHeight() / 2;
															
 
																+
															
 
																+        // sometimes the bbox has very high values, but CapHeight is OK
															
 
																+        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
															
 
																+        if (fontDescriptor != null) {
															
 
																+            float capHeight = fontDescriptor.getCapHeight();
															
 
																+            if (Float.compare(capHeight, 0) != 0 &&
															
 
																+                    (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
															
 
																+                glyphHeight = capHeight;
															
 
																+            }
															
 
																+            // PDFBOX-3464, PDFBOX-448:
															
 
																+            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
															
 
																+            float ascent = fontDescriptor.getAscent();
															
 
																+            float descent = fontDescriptor.getDescent();
															
 
																+            if (ascent > 0 && descent < 0 &&
															
 
																+                    ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
															
 
																+                glyphHeight = (ascent - descent) / 2;
															
 
																+            }
															
 
																+        }
															
 
																+
															
 
																+        // transformPoint from glyph space -> text space
															
 
																+        float height;
															
 
																+        if (font instanceof PDType3Font) {
															
 
																+            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
															
 
																+        } else {
															
 
																+            height = glyphHeight / 1000;
															
 
																+        }
															
 
																+
															
 
																+        return height;
															
 
																+    }
															
 
																+
															
 
																+    private boolean isPrintable(String s) {
															
 
																+        char c;
															
 
																+        Character.UnicodeBlock block;
															
 
																+        boolean printable = false;
															
 
																+        for (int i = 0; i < s.length(); i++) {
															
 
																+            c = s.charAt(i);
															
 
																+            block = Character.UnicodeBlock.of(c);
															
 
																+            printable |= !Character.isISOControl(c) && block != null && block != Character.UnicodeBlock.SPECIALS;
															
 
																+        }
															
 
																+        return printable;
															
 
																+    }
															
 
																+
															
 
																+    public List<TextElement> getTextElements() {
															
 
																+        return this.textElements;
															
 
																+    }
															
 
																+
															
 
																+    public RectangleSpatialIndex<TextElement> getSpatialIndex() {
															
 
																+        return spatialIndex;
															
 
																+    }
															
 
																+
															
 
																+    public float getMinCharWidth() {
															
 
																+        return minCharWidth;
															
 
																+    }
															
 
																+
															
 
																+    public float getMinCharHeight() {
															
 
																+        return minCharHeight;
															
 
																+    }
															
 
																+}
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
@@ -4,7 +4,6 @@ import cn.hutool.core.collection.CollUtil;
 
																 import cn.hutool.core.map.MapUtil;
															
 
																 import cn.hutool.core.util.ReflectUtil;
															
 
																 import cn.hutool.core.util.StrUtil;
															
 
																-import com.simuwang.base.common.conts.Constants;
															
 
																 import com.simuwang.base.mapper.EmailFieldMappingMapper;
															
 
																 import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
															
 
																 import com.simuwang.base.pojo.dto.report.ReportData;
															
@@ -14,6 +13,11 @@ import org.slf4j.LoggerFactory;
 
																 import java.util.List;
															
 
																 import java.util.Map;
															
 
																+/**
															
 
																+ * @author wangzaijun
															
 
																+ * @date 2024/9/30 18:13
															
 
																+ * @description 非python接口的报告解析抽象（主要是支持pdf、word和excel等格式）
															
 
																+ */
															
 
																 public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
															
 
																     protected final Logger logger = LoggerFactory.getLogger(this.getClass());
															
@@ -28,8 +32,10 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
 
																         this.fieldMapper = MapUtil.newHashMap(128);
															
 
																     }
															
 
																-    @Override
															
 
																-    public void init() {
															
 
																+    /**
															
 
																+     * 初始化数据的方法
															
 
																+     */
															
 
																+    protected void init() {
															
 
																         List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
															
 
																         if (CollUtil.isEmpty(emailFieldMapping)) {
															
 
																             this.logger.error("未设置报告解析规则！");
															
@@ -45,6 +51,13 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
 
																     }
															
 
																     /**
															
 
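																+     * Data-cleaning hook applied to the parsed result; it is abstract here, so every subclass supplies the behavior (which may be a no-op)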
															
 
																+     *
															
 
																+     * @param reportData 结果数据
															
 
																+     */
															
 
																+    protected abstract void cleaningReportData(T reportData);
															
 
																+
															
 
																+    /**
															
 
																      * 对象字段设置
															
 
																      *
															
 
																      * @param extInfoMap 名称与值的对应关系
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java
@@ -12,13 +12,6 @@ import java.io.IOException;
 
																  */
															
 
																 public interface ReportParser<T extends ReportData> {
															
 
																     /**
															
 
																-     * 初始化方法，该方法在执行前调用
															
 
																-     */
															
 
																-    default void init() {
															
 
																-
															
 
																-    }
															
 
																-
															
 
																-    /**
															
 
																      * 获取当前解析器名称
															
 
																      *
															
 
																      * @return /
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
@@ -1,7 +1,9 @@
 
																 package com.simuwang.daq.components.report.parser.pdf;
															
 
																 import cn.hutool.core.collection.CollUtil;
															
 
																+import cn.hutool.core.collection.ListUtil;
															
 
																 import cn.hutool.core.util.StrUtil;
															
 
																+import com.simuwang.base.common.conts.Constants;
															
 
																 import com.simuwang.base.common.exception.APIException;
															
 
																 import com.simuwang.base.mapper.EmailFieldMappingMapper;
															
 
																 import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
															
@@ -24,6 +26,7 @@ import java.util.Calendar;
 
																 import java.util.List;
															
 
																 import java.util.regex.Matcher;
															
 
																 import java.util.regex.Pattern;
															
 
																+import java.util.stream.Collectors;
															
 
																 /**
															
 
																  * @author wangzaijun
															
@@ -31,6 +34,11 @@ import java.util.regex.Pattern;
 
																  * @description pdf格式的报告解析抽象类
															
 
																  */
															
 
																 public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
															
 
																+    /**
															
 
																+     * 去除了水印的所有文本内容
															
 
																+     */
															
 
																+    protected List<String> textList;
															
 
																+
															
 
																     public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
															
 
																         super(fieldMappingMapper);
															
 
																     }
															
@@ -41,29 +49,35 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																         // 解析报告名称和表格
															
 
																         String reportName = null;
															
 
																         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
															
 
																-            CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 1);
															
 
																+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
															
 
																             stripper.setSortByPosition(true);
															
 
																-            List<String> textList = StrUtil.split(stripper.getText(document), System.lineSeparator());
															
 
																-            textList.removeIf(StrUtil::isBlank);
															
 
																-            if (CollUtil.isNotEmpty(textList)) {
															
 
																-                reportName = this.matchReportName(textList.get(0));
															
 
																+            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, System.lineSeparator());
															
 
																+            this.textList = StrUtil.split(text, System.lineSeparator());
															
 
																+            this.textList.removeIf(StrUtil::isBlank);
															
 
																+            if (CollUtil.isNotEmpty(this.textList)) {
															
 
																+                reportName = this.matchReportName(this.textList.get(0));
															
 
																                 if (StrUtil.isBlank(reportName)) {
															
 
																                     throw new APIException("未匹配到报告名称");
															
 
																                 }
															
 
																             }
															
 
																             // 解析所有表格
															
 
																+            List<Table> tables = ListUtil.list(true);
															
 
																             SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
															
 
																+            // 自定义表格提取工具，去除单元格中的水印文字
															
 
																             PageIterator pageIterator = new CustomObjectExtractor(document).extract();
															
 
																             while (pageIterator.hasNext()) {
															
 
																                 Page page = pageIterator.next();
															
 
																-                List<Table> tables = extractionAlgorithm.extract(page);
															
 
																-                this.initTableInfo(tables);
															
 
																+                tables.addAll(extractionAlgorithm.extract(page));
															
 
																             }
															
 
																+            this.initTableInfo(tables);
															
 
																         }
															
 
																         // 解析报告中主体基金的基本信息
															
 
																         ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
															
 
																         // 解析其他表格信息并且设置结果字段
															
 
																-        return this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
															
 
																+        T reportData = this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
															
 
																+        // 数据清洗后返回
															
 
																+        this.cleaningReportData(reportData);
															
 
																+        return reportData;
															
 
																     }
															
 
																     /**
																      * Receives the tables collected from every page of the document in a single call.
																      *
																      * @param tables all tables extracted from the document, in page order
																      */
																     protected abstract void initTableInfo(List<Table> tables);
															
@@ -72,6 +86,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																     /**
																      * Builds the result object; called after the tables were passed to {@code initTableInfo}
																      * and the fund info was parsed.
																      *
																      * @param reportName report name matched from the first text line, or null if the document had no text
																      * @param fundInfo   fund information returned by {@code parseFundInfo}
																      * @param params     parser parameters of the current report
																      * @return the report data, which the caller then passes to {@code cleaningReportData}
																      */
																     protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
															
 
																+    /**
																+     * No-op implementation for PDF parsers; subclasses may override it to clean the parsed data.
																+     *
																+     * @param reportData parsed report data, passed in right before it is returned to the caller
																+     */
																+    @Override
															
 
																+    protected void cleaningReportData(T reportData) {
															
 
																+        // Intentionally empty: nothing is cleaned by default.
															
 
																+    }
															
 
																+
															
 
																     /**
															
 
																      * 构建报告基本信息
															
 
																      *
															
@@ -89,6 +108,33 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																     }
															
 
																     /**
															
 
																+     * 匹配分级基金名称
															
 
																+     *
															
 
																+     * @param text 文本内容
															
 
																+     * @return /
															
 
																+     */
															
 
																+    protected List<String> matchTieredFund(String text) {
															
 
																+        List<String> matches = ListUtil.list(false);
															
 
																+        if (StrUtil.isBlank(text)) {
															
 
																+            return matches;
															
 
																+        }
															
 
																+        // 使用正则表达式查找匹配项
															
 
																+        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
															
 
																+        Matcher matcher = pattern.matcher(text);
															
 
																+        // 收集所有匹配项
															
 
																+        while (matcher.find()) {
															
 
																+            matches.add(matcher.group());
															
 
																+        }
															
 
																+        // 提取字母并按字母顺序排序
															
 
																+        return matches.stream()
															
 
																+                .map(s -> s.replaceAll("[^A-F]", ""))
															
 
																+                .distinct()
															
 
																+                .sorted()
															
 
																+                .map(letter -> letter + "级")
															
 
																+                .collect(Collectors.toList());
															
 
																+    }
															
 
																+
															
 
																+    /**
															
 
																      * 匹配报告名称
															
 
																      *
															
 
																      * @param text 文本内容
															
@@ -102,12 +148,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																         Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
															
 
																         Pattern pat2 = Pattern.compile("私募.*披露年度报[告表](（\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}）)?");
															
 
																         Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
															
 
																-
															
 
																         // 创建Matcher对象
															
 
																         Matcher matcher1 = pat1.matcher(text);
															
 
																         Matcher matcher2 = pat2.matcher(text);
															
 
																         Matcher matcher3 = pat3.matcher(text);
															
 
																-
															
 
																         // 尝试匹配
															
 
																         String reportName;
															
 
																         if (matcher1.find()) {
															
@@ -132,19 +176,16 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																         if (string == null) {
															
 
																             return null;
															
 
																         }
															
 
																-
															
 
																         // 编译正则表达式模式
															
 
																         Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
															
 
																         Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
															
 
																         Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
															
 
																         Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
															
 
																-
															
 
																         // 创建Matcher对象
															
 
																         Matcher matcher1 = pat1.matcher(string);
															
 
																         Matcher matcher2 = pat2.matcher(string);
															
 
																         Matcher matcher3 = pat3.matcher(string);
															
 
																         Matcher matcher4 = pat4.matcher(string);
															
 
																-
															
 
																         // 尝试匹配
															
 
																         if (matcher1.find()) {
															
 
																             String year = matcher1.group(1);
															
@@ -180,13 +221,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
																         if (string == null) {
															
 
																             return null;
															
 
																         }
															
 
																-
															
 
																         // 编译正则表达式模式
															
 
																         Pattern pattern = Pattern.compile("月|季度|年度");
															
 
																-
															
 
																         // 创建Matcher对象
															
 
																         Matcher matcher = pattern.matcher(string);
															
 
																-
															
 
																         // 尝试匹配
															
 
																         if (matcher.find()) {
															
 
																             return matcher.group();
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
@@ -23,7 +23,9 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
 
																     @Override
															
 
																     protected void initTableInfo(List<Table> tables) {
															
 
																-
															
 
																+        for (Table table : tables) {
															
 
																+            System.out.println(table.getColCount() + "," + table.getRowCount());
															
 
																+        }
															
 
																     }
															
 
																     @Override
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java
@@ -37,6 +37,7 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
																     @Override
															
 
																     protected void initTableInfo(List<Table> tables) {
															
 
																+        // 一般月报是固定的模板，4列表格是基金基本信息，其他5列的表格是月净值
															
 
																         for (Table table : tables) {
															
 
																             int colCount = table.getColCount();
															
 
																             if (colCount == 4) {
															
@@ -70,10 +71,11 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
																     @Override
															
 
																     protected MonthlyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
															
 
																+        Integer fileId = params.getFileId();
															
 
																         MonthlyReportData reportData = new MonthlyReportData();
															
 
																-        reportData.setBaseInfo(this.buildReportInfo(params.getFileId(), reportName));
															
 
																+        reportData.setBaseInfo(this.buildReportInfo(fileId, reportName));
															
 
																         reportData.setFundInfo(fundInfo);
															
 
																-
															
 
																+        // 母基金和分级基金的净值
															
 
																         List<ReportNetReportDTO> exts = ListUtil.list(false);
															
 
																         List<Table> extNavTables = this.extNavTables;
															
 
																         for (Table extNavTable : extNavTables) {
															
@@ -84,11 +86,25 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
																                 extInfoMap.put(key, value);
															
 
																             }
															
 
																             ReportNetReportDTO navInfo = new ReportNetReportDTO();
															
 
																-            navInfo.setFileId(params.getFileId());
															
 
																-            buildInfo(extInfoMap, navInfo);
															
 
																+            navInfo.setFileId(fileId);
															
 
																+            this.buildInfo(extInfoMap, navInfo);
															
 
																             exts.add(navInfo);
															
 
																         }
															
 
																+        // 分级基金匹配
															
 
																+        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
															
 
																+        levels.add(0, "母基金");
															
 
																+        for (int i = 0; i < exts.size(); i++) {
															
 
																+            if (levels.size() <= i) {
															
 
																+                continue;
															
 
																+            }
															
 
																+            exts.get(i).setLevel(levels.get(i));
															
 
																+        }
															
 
																         reportData.setNetReport(exts);
															
 
																         return reportData;
															
 
																     }
															
 
																+
															
 
																+    @Override
															
 
																+    protected void cleaningReportData(MonthlyReportData reportData) {
															
 
																+        // todo 数据清洗
															
 
																+    }
															
 
																 }
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
@@ -28,7 +28,9 @@ public class PDQuarterlyReportParser extends AbstractPDReportParser<QuarterlyRep
 
																     @Override
															
 
																     protected void initTableInfo(List<Table> tables) {
															
 
																-
															
 
																+        for (Table table : tables) {
															
 
																+            System.out.println(table.getColCount() + "," + table.getRowCount());
															
 
																+        }
															
 
																     }
															
 
																     @Override
															
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java
@@ -38,7 +38,6 @@ public abstract class AbstractPyReportParser<T extends ReportData> implements Re
 
																     @Override
															
 
																     public T parse(ReportParserParams params) throws IOException {
															
 
																-        this.init();
															
 
																         Boolean enablePyParser = this.properties.getEnablePyParser();
															
 
																         if (!enablePyParser) {
															
 
																             this.logger.error("The python report parser is unavailable!");
															
--- a/service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java
+++ b/service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java
@@ -832,8 +832,8 @@ public class EmailParseService {
 
																                     emailContentInfoDTOList.add(emailContentInfoDTO);
															
 
																                 }
															
 
																                 if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
															
 
																-                    // 估值表邮件不展示正文html文件
															
 
																-                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE)) {
															
 
																+                    // 估值表或定期报告邮件不展示正文html文件
															
 
																+                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE) || emailType.equals(EmailTypeConst.REPORT_EMAIL_TYPE)) {
															
 
																                         emailContentInfoDTOList = emailContentInfoDTOList.stream().filter(e -> !ExcelUtil.isHTML(e.getFilePath())).toList();
															
 
																                     }
															
 
																                     emailContentInfoDTOList.forEach(e -> {
															
--- a/service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java
+++ b/service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java
@@ -1,5 +1,6 @@
 
																 package technology.tabula;
															
 
																+import com.simuwang.daq.components.CustomTabulaTextStripper;
															
 
																 import org.apache.pdfbox.pdmodel.PDDocument;
															
 
																 import org.apache.pdfbox.pdmodel.PDPage;
															
@@ -7,8 +8,9 @@ import java.io.IOException;
 
																 /**
															
 
																  * @author wangzaijun
															
 
																- * @date 2024/9/30 11:15
															
 
																- * @description 重写的
															
 
																+ * @date 2024/9/30 18:08
															
 
																+ * @description 自定义的pdf表格提取，重写的目的是为了让自定义的去水印的文本提取工具生效
															
 
																+ * @see CustomTabulaTextStripper
															
 
																  */
															
 
																 public class CustomObjectExtractor extends ObjectExtractor {
															
 
																     private final PDDocument pdfDocument;
															
@@ -28,7 +30,7 @@ public class CustomObjectExtractor extends ObjectExtractor {
 
																         ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
															
 
																         streamEngine.processPage(page);
															
 
																-        TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
															
 
																+        CustomTabulaTextStripper textStripper = new CustomTabulaTextStripper(pdfDocument, pageNumber);
															
 
																         textStripper.process();
															
 
																         Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
															
--- a/service-deploy/src/test/java/com/simuwang/ApplicationTest.java
+++ b/service-deploy/src/test/java/com/simuwang/ApplicationTest.java
@@ -45,8 +45,8 @@ public class ApplicationTest {
 
																     @Test
															
 
																     public void reportTest() {
															
 
																         MailboxInfoDTO emailInfoDTO = this.buildMailbox();
															
 
																-        Date startDate = DateUtil.parse("2024-09-30 10:50:00", DateConst.YYYY_MM_DD_HH_MM_SS);
															
 
																-        Date endDate = DateUtil.parse("2024-09-30 19:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
															
 
																+        Date startDate = DateUtil.parse("2024-09-30 08:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
															
 
																+        Date endDate = DateUtil.parse("2024-09-30 09:01:00", DateConst.YYYY_MM_DD_HH_MM_SS);
															
 
																         try {
															
 
																             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
															
 
																         } catch (Exception e) {