Quellcode durchsuchen

feat:支持列格式的表格解析

wangzaijun vor 1 Monat
Ursprung
Commit
68d984c4e1

+ 10 - 7
mo-daq/src/main/java/com/smppw/modaq/application/components/ReportParseUtils.java

@@ -1,9 +1,7 @@
 package com.smppw.modaq.application.components;
 
 import cn.hutool.core.collection.ListUtil;
-import cn.hutool.core.map.MapUtil;
 import cn.hutool.core.util.StrUtil;
-import cn.hutool.http.HttpUtil;
 import com.smppw.modaq.common.conts.Constants;
 import com.smppw.modaq.common.enums.ReportParseStatus;
 import com.smppw.modaq.common.enums.ReportType;
@@ -11,7 +9,6 @@ import com.smppw.modaq.common.exception.ReportParseException;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.text.PDFTextStripper;
 import technology.tabula.CustomObjectExtractor;
 import technology.tabula.Page;
 import technology.tabula.PageIterator;
@@ -21,7 +18,6 @@ import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 import java.io.IOException;
 import java.util.Calendar;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -161,14 +157,18 @@ public final class ReportParseUtils {
 //        ASSET_ALLOCATION_TYPE_MAPPER.put("其他融资总额", "基金负债情况");
 //    }
 
+    public static String cleaningValue(Object value) { // Convenience overload: cleans the value with replaceEn = true.
+        return cleaningValue(value, true); // Resolves to cleaningValue(Object, boolean replaceEn).
+    }
+
     /**
      * Data cleaning that always strips parenthesised content (both Chinese and English
      * round brackets) by delegating to the three-argument overload with
      * {@code replaceParentheses = true}.
      * <p>
      * NOTE(review): before this change the {@code (Object, boolean)} signature took
      * {@code replaceParentheses}; the flag now means {@code replaceEn}. Any caller that
      * still passes {@code false} expecting parentheses to be kept should be re-checked.
      *
      * @param value the raw value to clean; converted to a string by the three-argument overload
      * @param replaceEn whether ASCII letters {@code [a-zA-Z]} are removed from the value
      * @return the cleaned string as produced by the three-argument overload
      */
-    public static String cleaningValue(Object value) {
-        return cleaningValue(value, true);
+    public static String cleaningValue(Object value, boolean replaceEn) {
+        return cleaningValue(value, true, replaceEn);
     }
 
     /**
@@ -178,7 +178,7 @@ public final class ReportParseUtils {
      * @param replaceParentheses 是否替换圆括号
      * @return /
      */
-    public static String cleaningValue(Object value, boolean replaceParentheses) {
+    public static String cleaningValue(Object value, boolean replaceParentheses, boolean replaceEn) {
         String fieldValue = StrUtil.toStringOrNull(value);
         if (!StrUtil.isNullOrUndefined(fieldValue)) {
             // 特殊字符替换,空格替换为空字符
@@ -188,6 +188,9 @@ public final class ReportParseUtils {
                     .replaceAll(":", ":")
                     .replaceAll(" ", StrUtil.EMPTY)
                     .replaceAll(":", StrUtil.EMPTY);
+            if (replaceEn) {
+                fieldValue = fieldValue.replaceAll("[a-zA-Z]", StrUtil.EMPTY);
+            }
             if (replaceParentheses) {
                 // 正则表达式匹配中文括号及其内容,并替换为空字符串
                 fieldValue = Pattern.compile("[(|(][^)]*[)|)]").matcher(fieldValue).replaceAll(StrUtil.EMPTY);

+ 1 - 1
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/AbstractReportParser.java

@@ -95,7 +95,7 @@ public abstract class AbstractReportParser<T extends ReportData> implements Repo
         }
         for (Map.Entry<String, Object> entry : extInfoMap.entrySet()) {
             String k = ReportParseUtils.cleaningValue(entry.getKey());
-            String fieldValue = ReportParseUtils.cleaningValue(entry.getValue());
+            String fieldValue = ReportParseUtils.cleaningValue(entry.getValue(), false);
             String fieldName = this.fieldMapper.get(k);
             if (StrUtil.isBlank(fieldName)) {
                 continue;

+ 0 - 5
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/AbstractPDReportParser.java

@@ -38,10 +38,6 @@ import java.util.Map;
  */
 public abstract class AbstractPDReportParser<T extends ReportData> extends AbstractReportParser<T> {
     /**
-     * 基金信息表格
-     */
-    protected Table fundInfoTable;
-    /**
      * 去除了水印的所有文本内容
      */
     protected List<String> textList;
@@ -170,7 +166,6 @@ public abstract class AbstractPDReportParser<T extends ReportData> extends Abstr
     protected void init() {
         super.init();
         // 先初始化为null
-        this.fundInfoTable = null;
         this.textList = null;
         this.aiFileId = null;
         this.aiParserContent = null;

+ 36 - 22
mo-daq/src/main/java/com/smppw/modaq/application/components/report/parser/pdf/PDLetterReportParser.java

@@ -36,30 +36,43 @@ public class PDLetterReportParser extends AbstractPDReportParser<LetterReportDat
             }
             return;
         }
-        Table table = tables.get(0);
-        if (table == null) {
-            return;
-        }
-        int rowCount = table.getRowCount();
-        int colCount = table.getColCount();
-        if (rowCount == 2) {
-            for (int i = 0; i < colCount; i++) {
-                String key = ReportParseUtils.cleaningValue(table.getCell(0, i).getText());
-                if (StrUtil.isBlank(key)) {
-                    continue;
-                }
-                this.allInfoMap.put(key, ReportParseUtils.cleaningValue(table.getCell(1, i).getText()));
-            }
-        } else if (colCount % 2 == 0) {
-            for (int i = 0; i < rowCount; i++) {
-                int t = colCount / 2;
-                for (int j = 0; j < t; j++) {
-                    String key = table.getCell(i, j * 2).getText().replaceAll("[a-zA-Z]", "");
-                    key = ReportParseUtils.cleaningValue(key);
+        for (Table table : tables) {
+            // Skip missing tables, as the previous single-table code did.
+            if (table == null) {
+                continue;
+            }
+            int rowCount = table.getRowCount();
+            int colCount = table.getColCount();
+            // Pick the key/value layout from the shape of the table.
+            if (rowCount == 2 || colCount > 4) {
+                // Exactly 2 rows, or more than 4 columns: each column is one field,
+                // with the key in row 0 and the value in row 1.
+                for (int i = 0; i < colCount; i++) {
+                    String key = ReportParseUtils.cleaningValue(table.getCell(0, i).getText());
                     if (StrUtil.isBlank(key)) {
                         continue;
                     }
-                    this.allInfoMap.put(key, ReportParseUtils.cleaningValue(table.getCell(i, j * 2 + 1).getText()));
+                    String value = ReportParseUtils.cleaningValue(table.getCell(1, i).getText(), false);
+                    // A literal "无" ("none") cell is stored as null.
+                    if (Objects.equals("无", value)) {
+                        value = null;
+                    }
+                    if (StrUtil.isNotBlank(value) && value.contains("/")) {
+                        String[] split = value.split("/");
+                        String[] keySplit = key.split("/");
+                        // Distribute "a/b" values over "k1/k2" headers only when the header has
+                        // at least as many segments; otherwise (e.g. a slash-formatted value under
+                        // a single-segment header) indexing keySplit would go out of bounds.
+                        if (keySplit.length >= split.length) {
+                            for (int k = 0; k < split.length; k++) {
+                                this.allInfoMap.put(keySplit[k], split[k]);
+                            }
+                            continue;
+                        }
+                    }
+                    // No slash, or the header cannot be split to match: keep the pair whole.
+                    this.allInfoMap.put(key, value);
+                }
+            } else {
+                // Row layout (2- or 4-column tables): within each row, columns 0, 2, ... hold
+                // keys and the column immediately to the right holds the value.
+                int pairCount = colCount / 2;
+                for (int i = 0; i < rowCount; i++) {
+                    for (int j = 0; j < pairCount; j++) {
+                        String key = ReportParseUtils.cleaningValue(table.getCell(i, j * 2).getText());
+                        if (StrUtil.isBlank(key)) {
+                            continue;
+                        }
+                        String value = table.getCell(i, j * 2 + 1).getText();
+                        // Values keep their ASCII letters (replaceEn = false).
+                        this.allInfoMap.put(key, ReportParseUtils.cleaningValue(value, false));
+                    }
                 }
             }
         }
@@ -74,8 +87,8 @@ public class PDLetterReportParser extends AbstractPDReportParser<LetterReportDat
                 fundInfo.setFundCode(fundName.substring(0, fundName.indexOf("_")));
                 fundName = StrUtil.split(fundName, "_").get(1);
                 this.allInfoMap.put("基金代码", fundInfo.getFundCode());
-                this.allInfoMap.put("基金名称", fundName);
             }
+            this.allInfoMap.put("基金名称", fundName);
             fundInfo.setFundName(fundName);
         }
         return fundInfo;
@@ -97,6 +110,7 @@ public class PDLetterReportParser extends AbstractPDReportParser<LetterReportDat
         return reportData;
     }
 
+    @SuppressWarnings("unchecked")
     private static Map<String, Object> flattenMap(Map<String, Object> data, List<String> keys) {
         Map<String, Object> result = MapUtil.newHashMap(16);
         for (Map.Entry<String, Object> entry : data.entrySet()) {