Kaynağa Gözat

feat:pdf提取的表格中去水印的自定义实现完成

wangzaijun 7 ay önce
ebeveyn
işleme
f74a336f74

+ 2 - 0
service-base/src/main/java/com/simuwang/base/common/conts/Constants.java

@@ -7,6 +7,8 @@ package com.simuwang.base.common.conts;
  * @author ruoyi
  */
 public class Constants {
+    public static final String WATERMARK_REPLACE = "+_+" + System.lineSeparator();
+
     public static final long DEFAULT_SERIAL_ID = 999L;
 
     /**

+ 8 - 10
service-daq/src/main/java/com/simuwang/daq/components/CustomPDFTextStripper.java

@@ -3,24 +3,22 @@ package com.simuwang.daq.components;
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.util.StrUtil;
-import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
-import technology.tabula.TextStripper;
 
 import java.io.IOException;
 import java.util.List;
 import java.util.stream.Collectors;
 
+import static com.simuwang.base.common.conts.Constants.WATERMARK_REPLACE;
+
 /**
  * @author wangzaijun
  * @date 2024/9/12 14:00
- * @description 自定义的文本去水印方法,发现水印基本是旋转文字并且比报告内其他文字都大
+ * @description 自定义的文本去水印方法,发现水印基本是旋转文字并且比报告内其他文字都大,区别于表格文字去水印的实现
+ * @see CustomTabulaTextStripper
  */
-public class CustomPDFTextStripper extends TextStripper {
-    public CustomPDFTextStripper(PDDocument document, int pageNumber) throws IOException {
-        super(document, pageNumber);
-    }
-
+public class CustomPDFTextStripper extends PDFTextStripper {
     @Override
     protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
         // 水印文字基本都是有角度的,统计有旋转角度的文字宽度
@@ -33,7 +31,7 @@ public class CustomPDFTextStripper extends TextStripper {
         }
         // 如果全是水印文字则直接去除
         if (textPositions.size() == weights.size()) {
-            super.writeString(System.lineSeparator());
+            super.writeString(WATERMARK_REPLACE);
             return;
         }
         // 否则去除水印(文字没有旋转角度,并且水印字体大小没有包含当前文字时说明是正常文字;否则识别为水印并用特殊符号代替)
@@ -41,7 +39,7 @@ public class CustomPDFTextStripper extends TextStripper {
         for (TextPosition textPosition : textPositions) {
             float col = textPosition.getTextMatrix().getValue(0, 1);
             float width = textPosition.getWidth();
-            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : System.lineSeparator());
+            newTexts.add(col == 0. && !weights.contains(width) ? textPosition.getUnicode() : WATERMARK_REPLACE);
         }
         super.writeString(String.join(StrUtil.EMPTY, newTexts));
     }

+ 190 - 0
service-daq/src/main/java/com/simuwang/daq/components/CustomTabulaTextStripper.java

@@ -0,0 +1,190 @@
+package com.simuwang.daq.components;
+
+import org.apache.fontbox.util.BoundingBox;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.text.TextPosition;
+import technology.tabula.RectangleSpatialIndex;
+import technology.tabula.TextElement;
+import technology.tabula.TextStripper;
+import technology.tabula.Utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Tabula text stripper that blanks out watermark characters while collecting the
+ * {@link TextElement}s of a single page.
+ * <p>
+ * Heuristic applied in {@code writeString}: a text position counts as rotated when element (0, 1)
+ * of its text matrix is non-zero. The heights of all rotated positions in the current run are
+ * collected. If every position in the run is rotated, the whole run is dropped. Otherwise each
+ * character that is rotated, or whose height equals one of the collected rotated heights, is
+ * replaced by a single space before its {@link TextElement} is created.
+ * <p>
+ * Besides the element list and spatial index, the stripper tracks the minimum character width
+ * and height and a running average height; the average is used to discard oversized blank elements.
+ * <p>
+ * {@code isPrintable} returns true when at least one char of the string is not an ISO control
+ * character and belongs to a Unicode block other than SPECIALS.
+ *
+ * @author wangzaijun
+ * @date 2024/9/12 14:00
+ * @description Custom watermark-removal text extraction. Watermarks were observed to be mostly
+ * rotated text that is larger than the other text in a report, so rotation angle and character
+ * size are the main criteria for deciding whether text is a watermark.
+ */
+public class CustomTabulaTextStripper extends TextStripper {
+    private static final String NBSP = "\u00A0"; // non-breaking space; replaced by a plain space in writeString
+    private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f; // blank elements at least this many times the average height are dropped
+    private static final float MAX_BLANK_FONT_SIZE = 40.0f; // blank elements with a larger font size are dropped
+    private static final float MIN_BLANK_FONT_SIZE = 2.0f; // blank elements with a smaller font size are dropped
+    private final PDDocument document;
+    private final ArrayList<TextElement> textElements;
+    private final RectangleSpatialIndex<TextElement> spatialIndex;
+    private float minCharWidth = Float.MAX_VALUE;
+    private float minCharHeight = Float.MAX_VALUE;
+    private float totalHeight = 0.0f; // running sum of element heights
+    private int countHeight = 0; // number of heights added to totalHeight
+
+    public CustomTabulaTextStripper(PDDocument document, int pageNumber) throws IOException {
+        super(document, pageNumber);
+        this.document = document;
+        this.setStartPage(pageNumber); // start and end page are the same: only pageNumber is processed
+        this.setEndPage(pageNumber);
+        this.textElements = new ArrayList<>();
+        this.spatialIndex = new RectangleSpatialIndex<>();
+    }
+
+    public void process() throws IOException {
+        this.getText(this.document); // returned text is discarded; results are read through the getters below
+    }
+
+    @Override
+    protected void writeString(String string, List<TextPosition> textPositions) {
+        // Positions that carry a rotation (text matrix element (0, 1) is non-zero).
+        // NOTE(review): this stream dereferences every element, while the loop below skips null elements - confirm nulls cannot occur here.
+        List<TextPosition> rotationTexts = textPositions.stream()
+                .filter(e -> e.getTextMatrix().getValue(0, 1) != 0.).collect(Collectors.toList());
+        // Watermark text is almost always rotated, so collect the heights of the rotated positions
+        List<Float> heights = rotationTexts.stream().map(TextPosition::getHeight).collect(Collectors.toList());
+        // If every position in this run is rotated (all watermark), drop the run entirely
+        if (textPositions.size() == heights.size()) {
+            return;
+        }
+
+        // Otherwise build a TextElement for each remaining character
+        for (TextPosition textPosition : textPositions) {
+            if (textPosition == null) {
+                continue;
+            }
+
+            String c = textPosition.getUnicode();
+
+            // skip text that is not printable
+            if (!isPrintable(c)) {
+                continue;
+            }
+
+            float h = textPosition.getHeightDir();
+
+            if (c.equals(NBSP)) { // replace non-breaking space for space
+                c = " ";
+            }
+
+            // A character is normal text only when it is not rotated and its height matches none of the
+            // rotated heights; otherwise it is treated as watermark and blanked to a single space.
+            // NOTE(review): heights were collected with getHeight() but h comes from getHeightDir() - confirm they agree.
+            float rotation = textPosition.getTextMatrix().getValue(0, 1);
+            if (rotation != 0. || heights.contains(h)) {
+                c = " ";
+            }
+
+            float wos = textPosition.getWidthOfSpace();
+
+            TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
+                    Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
+                    Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSizeInPt(), c,
+                    // workaround a possible bug in PDFBox:
+                    // https://issues.apache.org/jira/browse/PDFBOX-1755
+                    wos, textPosition.getDir());
+
+            this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
+            this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
+
+            countHeight++; // minima and the running average are updated before the discard checks below, so discarded elements still count
+            totalHeight += te.getHeight();
+            float avgHeight = totalHeight / countHeight;
+
+            //We have an issue where tall blank cells throw off the row height calculation
+            //Introspect a blank cell a bit here to see if it should be thrown away
+            if ((te.getText() == null || te.getText().trim().equals(""))) {
+                //if the cell height is more than AVG_HEIGHT_MULT_THRESHOLDxaverage, throw it away
+                if (avgHeight > 0
+                        && te.getHeight() >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD)) {
+                    continue;
+                }
+
+                //if the font size is outside of reasonable ranges, throw it away
+                if (textPosition.getFontSizeInPt() > MAX_BLANK_FONT_SIZE || textPosition.getFontSizeInPt() < MIN_BLANK_FONT_SIZE) {
+                    continue;
+                }
+            }
+
+            this.spatialIndex.add(te);
+            this.textElements.add(te);
+        }
+    }
+
+    @Override
+    protected float computeFontHeight(PDFont font) throws IOException {
+        BoundingBox bbox = font.getBoundingBox();
+        if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
+            // PDFBOX-2158 and PDFBOX-3130
+            // files by Salmat eSolutions / ClibPDF Library
+            bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
+        }
+        // 1/2 the bbox is used as the height todo: why?
+        float glyphHeight = bbox.getHeight() / 2;
+
+        // sometimes the bbox has very high values, but CapHeight is OK
+        PDFontDescriptor fontDescriptor = font.getFontDescriptor();
+        if (fontDescriptor != null) {
+            float capHeight = fontDescriptor.getCapHeight();
+            if (Float.compare(capHeight, 0) != 0 &&
+                    (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
+                glyphHeight = capHeight;
+            }
+            // PDFBOX-3464, PDFBOX-448:
+            // sometimes even CapHeight has very high value, but Ascent and Descent are ok
+            float ascent = fontDescriptor.getAscent();
+            float descent = fontDescriptor.getDescent();
+            if (ascent > 0 && descent < 0 &&
+                    ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
+                glyphHeight = (ascent - descent) / 2;
+            }
+        }
+
+        // transformPoint from glyph space -> text space
+        float height;
+        if (font instanceof PDType3Font) {
+            height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
+        } else {
+            height = glyphHeight / 1000;
+        }
+
+        return height;
+    }
+
+    private boolean isPrintable(String s) {
+        char c;
+        Character.UnicodeBlock block;
+        boolean printable = false;
+        for (int i = 0; i < s.length(); i++) {
+            c = s.charAt(i);
+            block = Character.UnicodeBlock.of(c);
+            printable |= !Character.isISOControl(c) && block != null && block != Character.UnicodeBlock.SPECIALS; // one printable char is enough
+        }
+        return printable;
+    }
+
+    public List<TextElement> getTextElements() {
+        return this.textElements;
+    }
+
+    public RectangleSpatialIndex<TextElement> getSpatialIndex() {
+        return spatialIndex;
+    }
+
+    public float getMinCharWidth() {
+        return minCharWidth;
+    }
+
+    public float getMinCharHeight() {
+        return minCharHeight;
+    }
+}

+ 16 - 3
service-daq/src/main/java/com/simuwang/daq/components/report/parser/AbstractReportParser.java

@@ -4,7 +4,6 @@ import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.ReflectUtil;
 import cn.hutool.core.util.StrUtil;
-import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dos.EmailFieldMappingDO;
 import com.simuwang.base.pojo.dto.report.ReportData;
@@ -14,6 +13,11 @@ import org.slf4j.LoggerFactory;
 import java.util.List;
 import java.util.Map;
 
+/**
+ * @author wangzaijun
+ * @date 2024/9/30 18:13
+ * @description 非python接口的报告解析抽象(主要是支持pdf、word和excel等格式)
+ */
 public abstract class AbstractReportParser<T extends ReportData> implements ReportParser<T> {
     protected final Logger logger = LoggerFactory.getLogger(this.getClass());
 
@@ -28,8 +32,10 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
         this.fieldMapper = MapUtil.newHashMap(128);
     }
 
-    @Override
-    public void init() {
+    /**
+     * 初始化数据的方法
+     */
+    protected void init() {
         List<EmailFieldMappingDO> emailFieldMapping = this.fieldMappingMapper.getEmailFieldMapping();
         if (CollUtil.isEmpty(emailFieldMapping)) {
             this.logger.error("未设置报告解析规则!");
@@ -45,6 +51,13 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
     }
 
     /**
+     * 数据清洗,由子类实现(本抽象方法没有默认实现)
+     *
+     * @param reportData 结果数据
+     */
+    protected abstract void cleaningReportData(T reportData);
+
+    /**
      * 对象字段设置
      *
      * @param extInfoMap 名称与值的对应关系

+ 0 - 7
service-daq/src/main/java/com/simuwang/daq/components/report/parser/ReportParser.java

@@ -12,13 +12,6 @@ import java.io.IOException;
  */
 public interface ReportParser<T extends ReportData> {
     /**
-     * 初始化方法,该方法在执行前调用
-     */
-    default void init() {
-
-    }
-
-    /**
      * 获取当前解析器名称
      *
      * @return /

+ 54 - 16
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/AbstractPDReportParser.java

@@ -1,7 +1,9 @@
 package com.simuwang.daq.components.report.parser.pdf;
 
 import cn.hutool.core.collection.CollUtil;
+import cn.hutool.core.collection.ListUtil;
 import cn.hutool.core.util.StrUtil;
+import com.simuwang.base.common.conts.Constants;
 import com.simuwang.base.common.exception.APIException;
 import com.simuwang.base.mapper.EmailFieldMappingMapper;
 import com.simuwang.base.pojo.dto.report.ReportBaseInfoDTO;
@@ -24,6 +26,7 @@ import java.util.Calendar;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 /**
  * @author wangzaijun
@@ -31,6 +34,11 @@ import java.util.regex.Pattern;
  * @description pdf格式的报告解析抽象类
  */
 public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
+    /**
+     * 去除了水印的所有文本内容
+     */
+    protected List<String> textList;
+
     public AbstractPDReportParser(EmailFieldMappingMapper fieldMappingMapper) {
         super(fieldMappingMapper);
     }
@@ -41,29 +49,35 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         // 解析报告名称和表格
         String reportName = null;
         try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile(params.getFilepath()))) {
-            CustomPDFTextStripper stripper = new CustomPDFTextStripper(document, 1);
+            CustomPDFTextStripper stripper = new CustomPDFTextStripper();
             stripper.setSortByPosition(true);
-            List<String> textList = StrUtil.split(stripper.getText(document), System.lineSeparator());
-            textList.removeIf(StrUtil::isBlank);
-            if (CollUtil.isNotEmpty(textList)) {
-                reportName = this.matchReportName(textList.get(0));
+            String text = stripper.getText(document).replace(Constants.WATERMARK_REPLACE, System.lineSeparator());
+            this.textList = StrUtil.split(text, System.lineSeparator());
+            this.textList.removeIf(StrUtil::isBlank);
+            if (CollUtil.isNotEmpty(this.textList)) {
+                reportName = this.matchReportName(this.textList.get(0));
                 if (StrUtil.isBlank(reportName)) {
                     throw new APIException("未匹配到报告名称");
                 }
             }
             // 解析所有表格
+            List<Table> tables = ListUtil.list(true);
             SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
+            // 自定义表格提取工具,去除单元格中的水印文字
             PageIterator pageIterator = new CustomObjectExtractor(document).extract();
             while (pageIterator.hasNext()) {
                 Page page = pageIterator.next();
-                List<Table> tables = extractionAlgorithm.extract(page);
-                this.initTableInfo(tables);
+                tables.addAll(extractionAlgorithm.extract(page));
             }
+            this.initTableInfo(tables);
         }
         // 解析报告中主体基金的基本信息
         ReportFundInfoDTO reportFundInfo = this.parseFundInfo(params);
         // 解析其他表格信息并且设置结果字段
-        return this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
+        T reportData = this.parseExtInfoAndSetData(reportName, reportFundInfo, params);
+        // 数据清洗后返回
+        this.cleaningReportData(reportData);
+        return reportData;
     }
 
     protected abstract void initTableInfo(List<Table> tables);
@@ -72,6 +86,11 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
 
     protected abstract T parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params);
 
+    @Override
+    protected void cleaningReportData(T reportData) {
+        // No-op: this implementation performs no cleaning on the report data.
+    }
+
     /**
      * 构建报告基本信息
      *
@@ -89,6 +108,33 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     }
 
     /**
+     * Extracts the tiered (share-class) fund labels mentioned in a text.
+     * <p>
+     * Every occurrence of {@code [A-F]级} or {@code 基金[A-F]} is found, reduced to its class letter,
+     * de-duplicated and sorted; each letter is then returned with the suffix 级. For example, a text
+     * containing "基金B" and "A级" yields [A级, B级].
+     *
+     * @param text the text to search
+     * @return the sorted, distinct class labels; an empty list when the text is blank or nothing matches
+     */
+    protected List<String> matchTieredFund(String text) {
+        List<String> matches = ListUtil.list(false);
+        if (StrUtil.isBlank(text)) {
+            return matches;
+        }
+        // Locate candidates with a regular expression
+        Pattern pattern = Pattern.compile("[A-F]级|基金[A-F]");
+        Matcher matcher = pattern.matcher(text);
+        // Collect every match
+        while (matcher.find()) {
+            matches.add(matcher.group());
+        }
+        // Keep only the class letter, de-duplicate, sort alphabetically and re-append the suffix
+        return matches.stream()
+                .map(s -> s.replaceAll("[^A-F]", ""))
+                .distinct()
+                .sorted()
+                .map(letter -> letter + "级")
+                .collect(Collectors.toList());
+    }
+
+    /**
      * 匹配报告名称
      *
      * @param text 文本内容
@@ -102,12 +148,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         Pattern pat1 = Pattern.compile(".+?报([告表])?\\d{4}(\\.?\\d{1,2}(\\.?\\d{2})?)?");
         Pattern pat2 = Pattern.compile("私募.*披露年度报[告表]((\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}))?");
         Pattern pat3 = Pattern.compile(".+?报([告表])?\\d{4}-\\d{2}-\\d{2}至\\d{4}-\\d{2}-\\d{2}?");
-
         // 创建Matcher对象
         Matcher matcher1 = pat1.matcher(text);
         Matcher matcher2 = pat2.matcher(text);
         Matcher matcher3 = pat3.matcher(text);
-
         // 尝试匹配
         String reportName;
         if (matcher1.find()) {
@@ -132,19 +176,16 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         if (string == null) {
             return null;
         }
-
         // 编译正则表达式模式
         Pattern pat1 = Pattern.compile("(2\\d{3}).*([一二三四1234])季度");  // 2023年XXX3季度
         Pattern pat2 = Pattern.compile("\\d{4}-\\d{2}-\\d{2}");  // 2023-12-31
         Pattern pat3 = Pattern.compile("(2\\d{3})年年度");  // 2023年年度
         Pattern pat4 = Pattern.compile("(\\d{4})年(\\d{1,2})月");  // 2023年12月
-
         // 创建Matcher对象
         Matcher matcher1 = pat1.matcher(string);
         Matcher matcher2 = pat2.matcher(string);
         Matcher matcher3 = pat3.matcher(string);
         Matcher matcher4 = pat4.matcher(string);
-
         // 尝试匹配
         if (matcher1.find()) {
             String year = matcher1.group(1);
@@ -180,13 +221,10 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
         if (string == null) {
             return null;
         }
-
         // 编译正则表达式模式
         Pattern pattern = Pattern.compile("月|季度|年度");
-
         // 创建Matcher对象
         Matcher matcher = pattern.matcher(string);
-
         // 尝试匹配
         if (matcher.find()) {
             return matcher.group();

+ 3 - 1
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDAnnuallyReportParser.java

@@ -23,7 +23,9 @@ public class PDAnnuallyReportParser extends AbstractPDReportParser<AnnuallyRepor
 
     @Override
     protected void initTableInfo(List<Table> tables) {
-
+        // Table handling for annual reports is not implemented in this block; only the dimensions
+        // (column count, row count) of each extracted table are reported. They go to the inherited
+        // SLF4J logger at debug level instead of being printed to stdout.
+        for (Table table : tables) {
+            this.logger.debug("{},{}", table.getColCount(), table.getRowCount());
+        }
     }
 
     @Override

+ 20 - 4
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDMonthlyReportParser.java

@@ -37,6 +37,7 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
     @Override
     protected void initTableInfo(List<Table> tables) {
+        // 一般月报是固定的模板,4列表格是基金基本信息,其他5列的表格是月净值
         for (Table table : tables) {
             int colCount = table.getColCount();
             if (colCount == 4) {
@@ -70,10 +71,11 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
 
     @Override
     protected MonthlyReportData parseExtInfoAndSetData(String reportName, ReportFundInfoDTO fundInfo, ReportParserParams params) {
+        Integer fileId = params.getFileId();
         MonthlyReportData reportData = new MonthlyReportData();
-        reportData.setBaseInfo(this.buildReportInfo(params.getFileId(), reportName));
+        reportData.setBaseInfo(this.buildReportInfo(fileId, reportName));
         reportData.setFundInfo(fundInfo);
-
+        // 母基金和分级基金的净值
         List<ReportNetReportDTO> exts = ListUtil.list(false);
         List<Table> extNavTables = this.extNavTables;
         for (Table extNavTable : extNavTables) {
@@ -84,11 +86,25 @@ public class PDMonthlyReportParser extends AbstractPDReportParser<MonthlyReportD
                 extInfoMap.put(key, value);
             }
             ReportNetReportDTO navInfo = new ReportNetReportDTO();
-            navInfo.setFileId(params.getFileId());
-            buildInfo(extInfoMap, navInfo);
+            navInfo.setFileId(fileId);
+            this.buildInfo(extInfoMap, navInfo);
             exts.add(navInfo);
         }
+        // 分级基金匹配
+        List<String> levels = this.matchTieredFund(String.join(",", this.textList));
+        levels.add(0, "母基金");
+        for (int i = 0; i < exts.size(); i++) {
+            if (levels.size() <= i) {
+                continue;
+            }
+            exts.get(i).setLevel(levels.get(i));
+        }
         reportData.setNetReport(exts);
         return reportData;
     }
+
+    @Override
+    protected void cleaningReportData(MonthlyReportData reportData) {
+        // TODO: data cleaning is not implemented yet
+    }
 }

+ 3 - 1
service-daq/src/main/java/com/simuwang/daq/components/report/parser/pdf/PDQuarterlyReportParser.java

@@ -28,7 +28,9 @@ public class PDQuarterlyReportParser extends AbstractPDReportParser<QuarterlyRep
 
     @Override
     protected void initTableInfo(List<Table> tables) {
-
+        // Table handling for quarterly reports is not implemented in this block; only the dimensions
+        // (column count, row count) of each extracted table are reported. They go to the inherited
+        // SLF4J logger at debug level instead of being printed to stdout.
+        for (Table table : tables) {
+            this.logger.debug("{},{}", table.getColCount(), table.getRowCount());
+        }
     }
 
     @Override

+ 0 - 1
service-daq/src/main/java/com/simuwang/daq/components/report/parser/py/AbstractPyReportParser.java

@@ -38,7 +38,6 @@ public abstract class AbstractPyReportParser<T extends ReportData> implements Re
 
     @Override
     public T parse(ReportParserParams params) throws IOException {
-        this.init();
         Boolean enablePyParser = this.properties.getEnablePyParser();
         if (!enablePyParser) {
             this.logger.error("The python report parser is unavailable!");

+ 2 - 2
service-daq/src/main/java/com/simuwang/daq/service/EmailParseService.java

@@ -832,8 +832,8 @@ public class EmailParseService {
                     emailContentInfoDTOList.add(emailContentInfoDTO);
                 }
                 if (CollUtil.isNotEmpty(emailContentInfoDTOList)) {
-                    // 估值表邮件不展示正文html文件
-                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE)) {
+                    // 估值表或定期报告邮件不展示正文html文件
+                    if (emailType.equals(EmailTypeConst.VALUATION_EMAIL_TYPE) || emailType.equals(EmailTypeConst.REPORT_EMAIL_TYPE)) {
                         emailContentInfoDTOList = emailContentInfoDTOList.stream().filter(e -> !ExcelUtil.isHTML(e.getFilePath())).toList();
                     }
                     emailContentInfoDTOList.forEach(e -> {

+ 5 - 3
service-daq/src/main/java/technology/tabula/CustomObjectExtractor.java

@@ -1,5 +1,6 @@
 package technology.tabula;
 
+import com.simuwang.daq.components.CustomTabulaTextStripper;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 
@@ -7,8 +8,9 @@ import java.io.IOException;
 
 /**
  * @author wangzaijun
- * @date 2024/9/30 11:15
- * @description 重写的
+ * @date 2024/9/30 18:08
+ * @description 自定义的pdf表格提取,重写的目的是为了让自定义的去水印的文本提取工具生效
+ * @see CustomTabulaTextStripper
  */
 public class CustomObjectExtractor extends ObjectExtractor {
     private final PDDocument pdfDocument;
@@ -28,7 +30,7 @@ public class CustomObjectExtractor extends ObjectExtractor {
         ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
         streamEngine.processPage(page);
 
-        TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
+        CustomTabulaTextStripper textStripper = new CustomTabulaTextStripper(pdfDocument, pageNumber);
         textStripper.process();
 
         Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);

+ 2 - 2
service-deploy/src/test/java/com/simuwang/ApplicationTest.java

@@ -45,8 +45,8 @@ public class ApplicationTest {
     @Test
     public void reportTest() {
         MailboxInfoDTO emailInfoDTO = this.buildMailbox();
-        Date startDate = DateUtil.parse("2024-09-30 10:50:00", DateConst.YYYY_MM_DD_HH_MM_SS);
-        Date endDate = DateUtil.parse("2024-09-30 19:40:00", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date startDate = DateUtil.parse("2024-09-30 08:59:30", DateConst.YYYY_MM_DD_HH_MM_SS);
+        Date endDate = DateUtil.parse("2024-09-30 09:01:00", DateConst.YYYY_MM_DD_HH_MM_SS);
         try {
             emailParseService.parseEmail(emailInfoDTO, startDate, endDate);
         } catch (Exception e) {