9 ay önce · f74a336f74
--- a/service-base/src/main/java/com/simuwang/base/common/conts/Constants.java
+++ b/service-base/src/main/java/com/simuwang/base/common/conts/Constants.java
@@ -7,6 +7,8 @@ package com.simuwang.base.common.conts;
 
				  * @author ruoyi
			
 
				  */
			
 
				 public class Constants {
			
 
				+    public static final String WATERMARK_REPLACE = "+_+" + System.lineSeparator(); // Marker (followed by a line separator) that CustomPDFTextStripper writes in place of watermark text; the PDF report parser replaces it with a line separator before splitting the text into lines.
			
 
				+
			
 
				     public static final long DEFAULT_SERIAL_ID = 999L;
			
 
				 
			
 
				     /**
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java
@@ -3,24 +3,22 @@ package com.simuwang.daq.components;
 
				 import cn.hutool.core.collection.CollUtil;
			
 
				 import cn.hutool.core.collection.ListUtil;
			
 
				 import cn.hutool.core.util.StrUtil;
			
 
				-import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				+import org.apache.pdfbox.text.PDFTextStripper;
			
 
				 import org.apache.pdfbox.text.TextPosition;
			
 
				-import technology.tabula.TextStripper;
			
 
				 
			
 
				 import java.io.IOException;
			
 
				 import java.util.List;
			
 
				 import java.util.stream.Collectors;
			
 
				 
			
 
				+import static com.simuwang.base.common.conts.Constants.WATERMARK_REPLACE;
			
 
				+
			
 
				 /**
			
 
				  * @author wangzaijun
			
 
				  * @date 2024/9/12 14:00
			
 
				- * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大
			
 
				+ * @description 自定义的文本去水印方法，发现水印基本是旋转文字并且比报告内其他文字都大，区别于表格文字去水印的实现
			
 
				+ * @see CustomTabulaTextStripper
			
 
				  */
			
 
				-public class CustomPDFTextStripper extends TextStripper {
			
 
				-    public CustomPDFTextStripper(PDDocument document, int pageNumber) throws IOException {
			
 
				-        super(document, pageNumber);
			
 
				-    }
			
 
				-
			
 
				+public class CustomPDFTextStripper extends PDFTextStripper {
			
 
				     @Override
			
 
				     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
			
 
				         // 水印文字基本都是有角度的，统计有旋转角度的文字宽度
			
@@ -33,7 +31,7 @@ public class CustomPDFTextStripper extends TextStripper {
 
				         }
			
 
				         // 如果全是水印文字则直接去除
			
 
				         if (textPositions.size() == weights.size()) {
			
 
				-            super.writeString(System.lineSeparator());
			
 
				+            super.writeString(WATERMARK_REPLACE);
			
 
				             return;
			
 
				         }
			
 
				         // 否则去除水印（文字没有旋转角度，并且水印字体大小没有包含当前文字时说明是正常文字；否则识别为水印并用特殊符号代替）
			
@@ -41,7 +39,7 @@ public class CustomPDFTextStripper extends TextStripper {
 
				         for (TextPosition textPosition : textPositions) {
			
 
				             float col = textPosition.getTextMatrix().getValue(0, 1);
			
 
				             float width = textPosition.getWidth();
			
 
				-            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : System.lineSeparator());
			
 
				+            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : WATERMARK_REPLACE);
			
 
				         }
			
 
				         super.writeString(String.join(StrUtil.EMPTY, newTexts));
			
 
				     }
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java
@@ -0,0 +1,190 @@
 
				+package com.simuwang.daq.components;
			
 
				+
			
 
				+import org.apache.fontbox.util.BoundingBox;
			
 
				+import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				+import org.apache.pdfbox.pdmodel.font.PDFont;
			
 
				+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
			
 
				+import org.apache.pdfbox.pdmodel.font.PDType3Font;
			
 
				+import org.apache.pdfbox.text.TextPosition;
			
 
				+import technology.tabula.RectangleSpatialIndex;
			
 
				+import technology.tabula.TextElement;
			
 
				+import technology.tabula.TextStripper;
			
 
				+import technology.tabula.Utils;
			
 
				+
			
 
				+import java.io.IOException;
			
 
				+import java.util.ArrayList;
			
 
				+import java.util.List;
			
 
				+import java.util.stream.Collectors;
			
 
				+
			
 
				+/**
			
 
				+ * Tabula text stripper that strips watermark text while collecting the {@link TextElement}s of one page.
			
 
				+ * <p>
			
 
				+ * Watermarks were observed to be rotated text that is larger than the other text in a report, so a glyph is
			
 
				+ * treated as watermark when it is rotated or when its height equals the height of a rotated glyph in the same
			
 
				+ * chunk. A chunk made up only of rotated glyphs is dropped; otherwise watermark glyphs are stored as a blank.
			
 
				+ *
			
 
				+ * @author wangzaijun
			
 
				+ * @date 2024/9/12 14:00
			
 
				+ */
			
 
				+public class CustomTabulaTextStripper extends TextStripper {
			
 
				+    /** Non-breaking space; replaced by a regular space before the element is stored. */
			
 
				+    private static final String NBSP = "\u00A0";
			
 
				+    /** A blank element at least this many times taller than the running average height is discarded. */
			
 
				+    private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f;
			
 
				+    /** A blank element with a font size (pt) above this value is discarded. */
			
 
				+    private static final float MAX_BLANK_FONT_SIZE = 40.0f;
			
 
				+    /** A blank element with a font size (pt) below this value is discarded. */
			
 
				+    private static final float MIN_BLANK_FONT_SIZE = 2.0f;
			
 
				+    private final PDDocument document;
			
 
				+    private final ArrayList<TextElement> textElements;
			
 
				+    private final RectangleSpatialIndex<TextElement> spatialIndex;
			
 
				+    private float minCharWidth = Float.MAX_VALUE;
			
 
				+    private float minCharHeight = Float.MAX_VALUE;
			
 
				+    // running sum and count of element heights, used for the average-height check on blank elements
			
 
				+    private float totalHeight = 0.0f;
			
 
				+    private int countHeight = 0;
			
 
				+
			
 
				+    /**
			
 
				+     * Creates a stripper restricted to a single page.
			
 
				+     *
			
 
				+     * @param document   the PDF document to read
			
 
				+     * @param pageNumber the page to extract; used as both start page and end page
			
 
				+     * @throws IOException if the parent stripper cannot be created
			
 
				+     */
			
 
				+    public CustomTabulaTextStripper(PDDocument document, int pageNumber) throws IOException {
			
 
				+        super(document, pageNumber);
			
 
				+        this.document = document;
			
 
				+        this.setStartPage(pageNumber);
			
 
				+        this.setEndPage(pageNumber);
			
 
				+        this.textElements = new ArrayList<>();
			
 
				+        this.spatialIndex = new RectangleSpatialIndex<>();
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * Runs text extraction over the configured page; the extracted glyphs arrive in
			
 
				+     * {@link #writeString(String, List)}, which fills the element list and the spatial index.
			
 
				+     *
			
 
				+     * @throws IOException if reading the document fails
			
 
				+     */
			
 
				+    public void process() throws IOException {
			
 
				+        this.getText(this.document);
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * Converts one chunk of glyphs into {@link TextElement}s, blanking out watermark glyphs.
			
 
				+     *
			
 
				+     * @param string        the chunk text (not used; the glyphs are read from {@code textPositions})
			
 
				+     * @param textPositions the glyphs of the chunk; null entries are skipped
			
 
				+     */
			
 
				+    @Override
			
 
				+    protected void writeString(String string, List<TextPosition> textPositions) {
			
 
				+        // Watermark text is almost always rotated: collect the heights of the rotated glyphs.
			
 
				+        // Null entries are filtered here as well, otherwise the lambda would throw before the
			
 
				+        // null check inside the loop below is ever reached.
			
 
				+        List<Float> heights = textPositions.stream()
			
 
				+                .filter(e -> e != null && e.getTextMatrix().getValue(0, 1) != 0.)
			
 
				+                .map(TextPosition::getHeight)
			
 
				+                .collect(Collectors.toList());
			
 
				+        // If every glyph is watermark text, drop the whole chunk
			
 
				+        if (textPositions.size() == heights.size()) {
			
 
				+            return;
			
 
				+        }
			
 
				+
			
 
				+        // Otherwise build a TextElement for each glyph
			
 
				+        for (TextPosition textPosition : textPositions) {
			
 
				+            if (textPosition == null) {
			
 
				+                continue;
			
 
				+            }
			
 
				+
			
 
				+            String c = textPosition.getUnicode();
			
 
				+
			
 
				+            // skip glyphs that contain no printable character
			
 
				+            if (!isPrintable(c)) {
			
 
				+                continue;
			
 
				+            }
			
 
				+
			
 
				+            float h = textPosition.getHeightDir();
			
 
				+
			
 
				+            if (c.equals(NBSP)) { // replace non-breaking space for space
			
 
				+                c = " ";
			
 
				+            }
			
 
				+
			
 
				+            // A glyph is regular text only when it is not rotated and its height is not one of the
			
 
				+            // rotated (watermark) heights; anything else is replaced by a blank
			
 
				+            float rotation = textPosition.getTextMatrix().getValue(0, 1);
			
 
				+            if (rotation != 0. || heights.contains(h)) {
			
 
				+                c = " ";
			
 
				+            }
			
 
				+
			
 
				+            float wos = textPosition.getWidthOfSpace();
			
 
				+
			
 
				+            TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
			
 
				+                    Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
			
 
				+                    Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSizeInPt(), c,
			
 
				+                    // workaround a possible bug in PDFBox:
			
 
				+                    // https://issues.apache.org/jira/browse/PDFBOX-1755
			
 
				+                    wos, textPosition.getDir());
			
 
				+
			
 
				+            this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
			
 
				+            this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
			
 
				+
			
 
				+            countHeight++;
			
 
				+            totalHeight += te.getHeight();
			
 
				+            float avgHeight = totalHeight / countHeight;
			
 
				+
			
 
				+            //We have an issue where tall blank cells throw off the row height calculation
			
 
				+            //Introspect a blank cell a bit here to see if it should be thrown away
			
 
				+            if ((te.getText() == null || te.getText().trim().equals(""))) {
			
 
				+                //if the cell height is more than AVG_HEIGHT_MULT_THRESHOLDxaverage, throw it away
			
 
				+                if (avgHeight > 0
			
 
				+                        && te.getHeight() >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD)) {
			
 
				+                    continue;
			
 
				+                }
			
 
				+
			
 
				+                //if the font size is outside of reasonable ranges, throw it away
			
 
				+                if (textPosition.getFontSizeInPt() > MAX_BLANK_FONT_SIZE || textPosition.getFontSizeInPt() < MIN_BLANK_FONT_SIZE) {
			
 
				+                    continue;
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            this.spatialIndex.add(te);
			
 
				+            this.textElements.add(te);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * Computes the height of a font in text space. Starts from half the bounding-box height and
			
 
				+     * falls back to CapHeight, then to half of (ascent - descent), whenever those are smaller or
			
 
				+     * the current value is zero.
			
 
				+     *
			
 
				+     * @param font the font to measure
			
 
				+     * @return the font height in text space
			
 
				+     * @throws IOException if the font bounding box cannot be read
			
 
				+     */
			
 
				+    @Override
			
 
				+    protected float computeFontHeight(PDFont font) throws IOException {
			
 
				+        BoundingBox bbox = font.getBoundingBox();
			
 
				+        if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
			
 
				+            // PDFBOX-2158 and PDFBOX-3130
			
 
				+            // files by Salmat eSolutions / ClibPDF Library
			
 
				+            bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
			
 
				+        }
			
 
				+        // 1/2 the bbox is used as the height todo: why?
			
 
				+        float glyphHeight = bbox.getHeight() / 2;
			
 
				+
			
 
				+        // sometimes the bbox has very high values, but CapHeight is OK
			
 
				+        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
			
 
				+        if (fontDescriptor != null) {
			
 
				+            float capHeight = fontDescriptor.getCapHeight();
			
 
				+            if (Float.compare(capHeight, 0) != 0 &&
			
 
				+                    (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
			
 
				+                glyphHeight = capHeight;
			
 
				+            }
			
 
				+            // PDFBOX-3464, PDFBOX-448:
			
 
				+            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
			
 
				+            float ascent = fontDescriptor.getAscent();
			
 
				+            float descent = fontDescriptor.getDescent();
			
 
				+            if (ascent > 0 && descent < 0 &&
			
 
				+                    ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
			
 
				+                glyphHeight = (ascent - descent) / 2;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        // transformPoint from glyph space -> text space
			
 
				+        float height;
			
 
				+        if (font instanceof PDType3Font) {
			
 
				+            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
			
 
				+        } else {
			
 
				+            height = glyphHeight / 1000;
			
 
				+        }
			
 
				+
			
 
				+        return height;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * Tells whether the string holds at least one printable character, i.e. one that is not an ISO
			
 
				+     * control character, belongs to a known Unicode block and is not in the SPECIALS block.
			
 
				+     *
			
 
				+     * @param s the string to inspect
			
 
				+     * @return {@code true} if any character is printable; {@code false} for an empty string
			
 
				+     */
			
 
				+    private boolean isPrintable(String s) {
			
 
				+        char c;
			
 
				+        Character.UnicodeBlock block;
			
 
				+        boolean printable = false;
			
 
				+        for (int i = 0; i < s.length(); i++) {
			
 
				+            c = s.charAt(i);
			
 
				+            block = Character.UnicodeBlock.of(c);
			
 
				+            printable |= !Character.isISOControl(c) && block != null && block != Character.UnicodeBlock.SPECIALS;
			
 
				+        }
			
 
				+        return printable;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * @return the text elements collected so far (the internal list, not a copy)
			
 
				+     */
			
 
				+    public List<TextElement> getTextElements() {
			
 
				+        return this.textElements;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * @return the spatial index holding the collected text elements
			
 
				+     */
			
 
				+    public RectangleSpatialIndex<TextElement> getSpatialIndex() {
			
 
				+        return spatialIndex;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * @return the smallest element width seen, or {@link Float#MAX_VALUE} if no element was built
			
 
				+     */
			
 
				+    public float getMinCharWidth() {
			
 
				+        return minCharWidth;
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				+     * @return the smallest element height seen, or {@link Float#MAX_VALUE} if no element was built
			
 
				+     */
			
 
				+    public float getMinCharHeight() {
			
 
				+        return minCharHeight;
			
 
				+    }
			
 
				+}
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java
@@ -4,7 +4,6 @@ import cn.hutool.core.collection.CollUtil;
 
				 import cn.hutool.core.map.MapUtil;
			
 
				 import cn.hutool.core.util.ReflectUtil;
			
 
				 import cn.hutool.core.util.StrUtil;
			
 
				-import com.simuwang.base.common.conts.Constants;
			
 
				 import com.simuwang.base.mapper.EmailFieldMappingMapper;
			
 
				 import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
			
 
				 import com.simuwang.base.pojo.dto.report.ReportData;
			
@@ -14,6 +13,11 @@ import org.slf4j.LoggerFactory;
 
				 import java.util.List;
			
 
				 import java.util.Map;
			
 
				 
			
 
				+/**
			
 
				+ * @author wangzaijun
			
 
				+ * @date 2024/9/30 18:13
			
 
				+ * @description 非python接口的报告解析抽象（主要是支持pdf、word和excel等格式）
			
 
				+ */
			
 
				 public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
			
 
				     protected final Logger logger = LoggerFactory.getLogger(this.getClass());
			
 
				 
			
@@ -28,8 +32,10 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
 
				         this.fieldMapper = MapUtil.newHashMap(128);
			
 
				     }
			
 
				 
			
 
				-    @Override
			
 
				-    public void init() {
			
 
				+    /**
			
 
				+     * 初始化数据的方法
			
 
				+     */
			
 
				+    protected void init() {
			
 
				         List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
			
 
				         if (CollUtil.isEmpty(emailFieldMapping)) {
			
 
				             this.logger.error("未设置报告解析规则！");
			
@@ -45,6 +51,13 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
 
				     }
			
 
				 
			
 
				     /**
			
 
				+     * Data-cleaning hook invoked on the parsed result. It is declared abstract here, so each concrete
			
 
				+     * parser has to provide an implementation (which may be empty).
			
 
				+     *
			
 
				+     * @param reportData the parsed report data to clean
			
 
				+     */
			
 
				+    protected abstract void cleaningReportData(T reportData);
			
 
				+
			
 
				+    /**
			
 
				      * 对象字段设置
			
 
				      *
			
 
				      * @param extInfoMap 名称与值的对应关系
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java
@@ -12,13 +12,6 @@ import java.io.IOException;
 
				  */
			
 
				 public interface ReportParser<T extends ReportData> {
			
 
				     /**
			
 
				-     * 初始化方法，该方法在执行前调用
			
 
				-     */
			
 
				-    default void init() {
			
 
				-
			
 
				-    }
			
 
				-
			
 
				-    /**
			
 
				      * 获取当前解析器名称
			
 
				      *
			
 
				      * @return /
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java
@@ -1,7 +1,9 @@
 
				 package com.simuwang.daq.components.report.parser.pdf;
			
 
				 
			
 
				 import cn.hutool.core.collection.CollUtil;
			
 
				+import cn.hutool.core.collection.ListUtil;
			
 
				 import cn.hutool.core.util.StrUtil;
			
 
				+import com.simuwang.base.common.conts.Constants;
			
 
				 import com.simuwang.base.common.exception.APIException;
			
 
				 import com.simuwang.base.mapper.EmailFieldMappingMapper;
			
 
				 import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
			
@@ -24,6 +26,7 @@ import java.util.Calendar;
 
				 import java.util.List;
			
 
				 import java.util.regex.Matcher;
			
 
				 import java.util.regex.Pattern;
			
 
				+import java.util.stream.Collectors;
			
 
				 
			
 
				 /**
			
 
				  * @author wangzaijun
			
@@ -31,6 +34,11 @@ import java.util.regex.Pattern;
 
				  * @description pdf格式的报告解析抽象类
			
 
				  */
			
 
				 public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
			
 
				+    /**
			
 
				+     * Non-blank text lines of the PDF with the watermark text removed; assigned while the PDF is parsed.
			
 
				+     * NOTE(review): this is mutable per-instance state - if a parser instance is shared between
			
 
				+     * threads, concurrent parses would overwrite each other's lines; confirm how instances are scoped.
			
 
				+     */
			
 
				+    protected List<String> textList;
			
 
				+
			
 
				     public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
			
 
				         super(fieldMappingMapper);
			
 
				     }
			
@@ -41,29 +49,35 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				         // 解析报告名称和表格
			
 
				         String reportName = null;
			
 
				         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
			
 
				-            CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 1);
			
 
				+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
			
 
				             stripper.setSortByPosition(true);
			
 
				-            List<String> textList = StrUtil.split(stripper.getText(document), System.lineSeparator());
			
 
				-            textList.removeIf(StrUtil::isBlank);
			
 
				-            if (CollUtil.isNotEmpty(textList)) {
			
 
				-                reportName = this.matchReportName(textList.get(0));
			
 
				+            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, System.lineSeparator());
			
 
				+            this.textList = StrUtil.split(text, System.lineSeparator());
			
 
				+            this.textList.removeIf(StrUtil::isBlank);
			
 
				+            if (CollUtil.isNotEmpty(this.textList)) {
			
 
				+                reportName = this.matchReportName(this.textList.get(0));
			
 
				                 if (StrUtil.isBlank(reportName)) {
			
 
				                     throw new APIException("未匹配到报告名称");
			
 
				                 }
			
 
				             }
			
 
				             // 解析所有表格
			
 
				+            List<Table> tables = ListUtil.list(true);
			
 
				             SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
			
 
				+            // 自定义表格提取工具，去除单元格中的水印文字
			
 
				             PageIterator pageIterator = new CustomObjectExtractor(document).extract();
			
 
				             while (pageIterator.hasNext()) {
			
 
				                 Page page = pageIterator.next();
			
 
				-                List<Table> tables = extractionAlgorithm.extract(page);
			
 
				-                this.initTableInfo(tables);
			
 
				+                tables.addAll(extractionAlgorithm.extract(page));
			
 
				             }
			
 
				+            this.initTableInfo(tables);
			
 
				         }
			
 
				         // 解析报告中主体基金的基本信息
			
 
				         ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
			
 
				         // 解析其他表格信息并且设置结果字段
			
 
				-        return this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
			
 
				+        T reportData = this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
			
 
				+        // 数据清洗后返回
			
 
				+        this.cleaningReportData(reportData);
			
 
				+        return reportData;
			
 
				     }
			
 
				 
			
 
				     protected abstract void initTableInfo(List<Table> tables);
			
@@ -72,6 +86,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				 
			
 
				     protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
			
 
				 
			
 
				+    @Override
			
 
				+    protected void cleaningReportData(T reportData) {
			
 
				+        // Intentionally empty: no cleaning is applied at this level; subclasses may override.
			
 
				+    }
			
 
				+
			
 
				     /**
			
 
				      * 构建报告基本信息
			
 
				      *
			
@@ -89,6 +108,33 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				     }
			
 
				 
			
 
				     /**
			
 
				+     * 匹配分级基金名称
			
 
				+     *
			
 
				+     * @param text 文本内容
			
 
				+     * @return /
			
 
				+     */
			
 
				+    protected List<String> matchTieredFund(String text) {
			
 
				+        List<String> matches = ListUtil.list(false);
			
 
				+        if (StrUtil.isBlank(text)) {
			
 
				+            return matches;
			
 
				+        }
			
 
				+        // 使用正则表达式查找匹配项
			
 
				+        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
			
 
				+        Matcher matcher = pattern.matcher(text);
			
 
				+        // 收集所有匹配项
			
 
				+        while (matcher.find()) {
			
 
				+            matches.add(matcher.group());
			
 
				+        }
			
 
				+        // 提取字母并按字母顺序排序
			
 
				+        return matches.stream()
			
 
				+                .map(s -> s.replaceAll("[^A-F]", ""))
			
 
				+                .distinct()
			
 
				+                .sorted()
			
 
				+                .map(letter -> letter + "级")
			
 
				+                .collect(Collectors.toList());
			
 
				+    }
			
 
				+
			
 
				+    /**
			
 
				      * 匹配报告名称
			
 
				      *
			
 
				      * @param text 文本内容
			
@@ -102,12 +148,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				         Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
			
 
				         Pattern pat2 = Pattern.compile("私募.*披露年度报[告表](（\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}）)?");
			
 
				         Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
			
 
				-
			
 
				         // 创建Matcher对象
			
 
				         Matcher matcher1 = pat1.matcher(text);
			
 
				         Matcher matcher2 = pat2.matcher(text);
			
 
				         Matcher matcher3 = pat3.matcher(text);
			
 
				-
			
 
				         // 尝试匹配
			
 
				         String reportName;
			
 
				         if (matcher1.find()) {
			
@@ -132,19 +176,16 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				         if (string == null) {
			
 
				             return null;
			
 
				         }
			
 
				-
			
 
				         // 编译正则表达式模式
			
 
				         Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
			
 
				         Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
			
 
				         Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
			
 
				         Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
			
 
				-
			
 
				         // 创建Matcher对象
			
 
				         Matcher matcher1 = pat1.matcher(string);
			
 
				         Matcher matcher2 = pat2.matcher(string);
			
 
				         Matcher matcher3 = pat3.matcher(string);
			
 
				         Matcher matcher4 = pat4.matcher(string);
			
 
				-
			
 
				         // 尝试匹配
			
 
				         if (matcher1.find()) {
			
 
				             String year = matcher1.group(1);
			
@@ -180,13 +221,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
				         if (string == null) {
			
 
				             return null;
			
 
				         }
			
 
				-
			
 
				         // 编译正则表达式模式
			
 
				         Pattern pattern = Pattern.compile("月|季度|年度");
			
 
				-
			
 
				         // 创建Matcher对象
			
 
				         Matcher matcher = pattern.matcher(string);
			
 
				-
			
 
				         // 尝试匹配
			
 
				         if (matcher.find()) {
			
 
				             return matcher.group();
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java
@@ -23,7 +23,9 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
 
				 
			
 
				     @Override
			
 
				     protected void initTableInfo(List<Table> tables) {
			
 
				-
			
 
				+        // Diagnostic only: report each table's column and row count through the class logger
			
 
				+        // instead of writing to standard output
			
 
				+        for (Table table : tables) {
			
 
				+            this.logger.debug("{},{}", table.getColCount(), table.getRowCount());
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     @Override
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java
@@ -37,6 +37,7 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
				 
			
 
				     @Override
			
 
				     protected void initTableInfo(List<Table> tables) {
			
 
				+        // 一般月报是固定的模板，4列表格是基金基本信息，其他5列的表格是月净值
			
 
				         for (Table table : tables) {
			
 
				             int colCount = table.getColCount();
			
 
				             if (colCount == 4) {
			
@@ -70,10 +71,11 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
				 
			
 
				     @Override
			
 
				     protected MonthlyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
			
 
				+        Integer fileId = params.getFileId();
			
 
				         MonthlyReportData reportData = new MonthlyReportData();
			
 
				-        reportData.setBaseInfo(this.buildReportInfo(params.getFileId(), reportName));
			
 
				+        reportData.setBaseInfo(this.buildReportInfo(fileId, reportName));
			
 
				         reportData.setFundInfo(fundInfo);
			
 
				-
			
 
				+        // 母基金和分级基金的净值
			
 
				         List<ReportNetReportDTO> exts = ListUtil.list(false);
			
 
				         List<Table> extNavTables = this.extNavTables;
			
 
				         for (Table extNavTable : extNavTables) {
			
@@ -84,11 +86,25 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
				                 extInfoMap.put(key, value);
			
 
				             }
			
 
				             ReportNetReportDTO navInfo = new ReportNetReportDTO();
			
 
				-            navInfo.setFileId(params.getFileId());
			
 
				-            buildInfo(extInfoMap, navInfo);
			
 
				+            navInfo.setFileId(fileId);
			
 
				+            this.buildInfo(extInfoMap, navInfo);
			
 
				             exts.add(navInfo);
			
 
				         }
			
 
				+        // 分级基金匹配
			
 
				+        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
			
 
				+        levels.add(0, "母基金");
			
 
				+        for (int i = 0; i < exts.size(); i++) {
			
 
				+            if (levels.size() <= i) {
			
 
				+                continue;
			
 
				+            }
			
 
				+            exts.get(i).setLevel(levels.get(i));
			
 
				+        }
			
 
				         reportData.setNetReport(exts);
			
 
				         return reportData;
			
 
				     }
			
 
				+
			
 
				+    @Override
			
 
				+    protected void cleaningReportData(MonthlyReportData reportData) {
			
 
				+        // TODO: data cleaning for monthly reports is not implemented yet
			
 
				+    }
			
 
				 }
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java
@@ -28,7 +28,9 @@ public class PDQuarterlyReportParser extends AbstractPDReportParser<QuarterlyRep
 
				 
			
 
				     @Override
			
 
				     protected void initTableInfo(List<Table> tables) {
			
 
				-
			
 
				+        // Diagnostic only: report each table's column and row count through the class logger
			
 
				+        // instead of writing to standard output
			
 
				+        for (Table table : tables) {
			
 
				+            this.logger.debug("{},{}", table.getColCount(), table.getRowCount());
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     @Override
			
--- a/service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java
+++ b/service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java
@@ -38,7 +38,6 @@ public abstract class AbstractPyReportParser<T extends ReportData> implements Re
 
				 
			
 
				     @Override
			
 
				     public T parse(ReportParserParams params) throws IOException {
			
 
				-        this.init();
			
 
				         Boolean enablePyParser = this.properties.getEnablePyParser();
			
 
				         if (!enablePyParser) {
			
 
				             this.logger.error("The python report parser is unavailable!");
			
--- a/service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java
+++ b/service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java
@@ -832,8 +832,8 @@ public class EmailParseService {
 
				                     emailContentInfoDTOList.add(emailContentInfoDTO);
			
 
				                 }
			
 
				                 if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
			
 
				-                    // 估值表邮件不展示正文html文件
			
 
				-                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE)) {
			
 
				+                    // 估值表或定期报告邮件不展示正文html文件
			
 
				+                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE) || emailType.equals(EmailTypeConst.REPORT_EMAIL_TYPE)) {
			
 
				                         emailContentInfoDTOList = emailContentInfoDTOList.stream().filter(e -> !ExcelUtil.isHTML(e.getFilePath())).toList();
			
 
				                     }
			
 
				                     emailContentInfoDTOList.forEach(e -> {
			
--- a/service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java
+++ b/service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java
@@ -1,5 +1,6 @@
 
				 package technology.tabula;
			
 
				 
			
 
				+import com.simuwang.daq.components.CustomTabulaTextStripper;
			
 
				 import org.apache.pdfbox.pdmodel.PDDocument;
			
 
				 import org.apache.pdfbox.pdmodel.PDPage;
			
 
				 
			
@@ -7,8 +8,9 @@ import java.io.IOException;
 
				 
			
 
				 /**
			
 
				  * @author wangzaijun
			
 
				- * @date 2024/9/30 11:15
			
 
				- * @description 重写的
			
 
				+ * @date 2024/9/30 18:08
			
 
				+ * @description 自定义的pdf表格提取，重写的目的是为了让自定义的去水印的文本提起工具生效
			
 
				+ * @see CustomTabulaTextStripper
			
 
				  */
			
 
				 public class CustomObjectExtractor extends ObjectExtractor {
			
 
				     private final PDDocument pdfDocument;
			
@@ -28,7 +30,7 @@ public class CustomObjectExtractor extends ObjectExtractor {
 
				         ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
			
 
				         streamEngine.processPage(page);
			
 
				 
			
 
				-        TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
			
 
				+        CustomTabulaTextStripper textStripper = new CustomTabulaTextStripper(pdfDocument, pageNumber);
			
 
				         textStripper.process();
			
 
				 
			
 
				         Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
			
--- a/service-deploy/src/test/java/com/simuwang/ApplicationTest.java
+++ b/service-deploy/src/test/java/com/simuwang/ApplicationTest.java
@@ -45,8 +45,8 @@ public class ApplicationTest {
 
				     @Test
			
 
				     public void reportTest() {
			
 
				         MailboxInfoDTO emailInfoDTO = this.buildMailbox();
			
 
				-        Date startDate = DateUtil.parse("2024-09-30 10:50:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				-        Date endDate = DateUtil.parse("2024-09-30 19:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				+        Date startDate = DateUtil.parse("2024-09-30 08:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				+        Date endDate = DateUtil.parse("2024-09-30 09:01:00", DateConst.YYYY_MM_DD_HH_MM_SS);
			
 
				         try {
			
 
				             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
			
 
				         } catch (Exception e) {