|
@@ -1,42 +1,67 @@
|
|
|
-//package com.simuwang.daq.utils;
|
|
|
-//
|
|
|
-//import cn.hutool.core.collection.CollUtil;
|
|
|
-//import cn.hutool.core.collection.ListUtil;
|
|
|
-//import cn.hutool.core.map.MapUtil;
|
|
|
-//import cn.hutool.core.util.ReflectUtil;
|
|
|
-//import cn.hutool.core.util.StrUtil;
|
|
|
-//import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
-//import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
-//import com.smppw.common.pojo.ValueLabelVO;
|
|
|
-//import org.apache.pdfbox.Loader;
|
|
|
-//import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
|
|
-//import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
|
|
-//import org.apache.pdfbox.cos.COSName;
|
|
|
-//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
-//import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
-//import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
-//import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
-//import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
-//import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
|
|
-//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
-//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
-//import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
|
|
-//import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
-//import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
-//import org.apache.pdfbox.text.TextPosition;
|
|
|
-//import org.apache.pdfbox.util.Matrix;
|
|
|
-//import technology.tabula.*;
|
|
|
-//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
-//
|
|
|
-//import java.awt.geom.Rectangle2D;
|
|
|
-//import java.io.IOException;
|
|
|
-//import java.util.*;
|
|
|
-//import java.util.regex.Matcher;
|
|
|
-//import java.util.regex.Pattern;
|
|
|
-//import java.util.stream.Collectors;
|
|
|
-//
|
|
|
-//public class ReportParseUtil {
|
|
|
-// public static void main(String[] args) throws IOException {
|
|
|
+package com.simuwang.daq.utils;
|
|
|
+
|
|
|
+import cn.hutool.core.collection.CollUtil;
|
|
|
+import cn.hutool.core.collection.ListUtil;
|
|
|
+import cn.hutool.core.map.MapUtil;
|
|
|
+import cn.hutool.core.util.ReflectUtil;
|
|
|
+import cn.hutool.core.util.StrUtil;
|
|
|
+import cn.hutool.http.HttpUtil;
|
|
|
+import cn.hutool.json.JSONObject;
|
|
|
+import cn.hutool.json.JSONUtil;
|
|
|
+import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
+import com.simuwang.daq.dto.ReportFundInfo;
|
|
|
+import com.smppw.common.pojo.ValueLabelVO;
|
|
|
+import org.apache.pdfbox.Loader;
|
|
|
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
|
|
+import org.apache.pdfbox.contentstream.operator.text.ShowText;
|
|
|
+import org.apache.pdfbox.cos.COSName;
|
|
|
+import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
+import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
+import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
+import org.apache.pdfbox.text.PDFTextStripperByArea;
|
|
|
+import org.apache.pdfbox.text.TextPosition;
|
|
|
+import org.apache.pdfbox.util.Matrix;
|
|
|
+import technology.tabula.*;
|
|
|
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
+
|
|
|
+import java.awt.geom.Rectangle2D;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.*;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+public class ReportParseUtil {
|
|
|
+ public static void main(String[] args) throws IOException {
|
|
|
+ String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
|
|
|
+ Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
|
|
|
+ Matcher matcher = pattern.matcher(fileName);
|
|
|
+ String registerNumber = null;
|
|
|
+ if (matcher.find()) {
|
|
|
+ registerNumber = matcher.group();
|
|
|
+ }
|
|
|
+
|
|
|
+ String baseUrl = "http://192.168.0.81:8088";
|
|
|
+ String api = "/api/v1/parse/amac_report";
|
|
|
+ Map<String, Object> params = MapUtil.newHashMap(16);
|
|
|
+ params.put("file_id", 111112);
|
|
|
+ params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
|
|
|
+ params.put("register_number", registerNumber);
|
|
|
+ params.put("file_type", 1);
|
|
|
+ params.put("file_name", fileName);
|
|
|
+ params.put("fund_name", null);
|
|
|
+ params.put("trust_name", null);
|
|
|
+ String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
|
|
|
+ JSONObject obj = JSONUtil.parseObj(body);
|
|
|
+ System.out.println(obj);
|
|
|
+
|
|
|
// List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
// fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
// fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
@@ -129,220 +154,220 @@
|
|
|
// }
|
|
|
// }
|
|
|
// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// /**
|
|
|
-// * 找图片水印
|
|
|
-// *
|
|
|
-// * @param page
|
|
|
-// * @return
|
|
|
-// * @throws IOException
|
|
|
-// */
|
|
|
-// public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
-// Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
+ }
|
|
|
+
|
|
|
+    /**
+     * Finds the image XObjects declared in a page's resources (watermark candidates).
+     *
+     * <p>Only entries that PDFBox itself resolves to a {@link PDImageXObject} are returned.
+     * The previous version wrapped the stream of every XObject in a new
+     * {@code PDImageXObject}, so non-image XObjects (for example form XObjects) were
+     * reported as images, and construction failures were only printed to stderr.
+     *
+     * @param page the page whose resource dictionary is inspected
+     * @return map from XObject name to image; empty when the page has no resources
+     *         or no image XObjects
+     * @throws IOException if an XObject cannot be read from the resources
+     */
+    public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
+        Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
+        PDResources resources = page.getResources();
+        if (resources == null) {
+            // Nothing to scan; avoids the NullPointerException the old loop would raise.
+            return watermarkMap;
+        }
+        for (COSName xObjectName : resources.getXObjectNames()) {
+            PDXObject xObject = resources.getXObject(xObjectName);
+            // instanceof is also false for null, so unresolved names are skipped safely.
+            if (xObject instanceof PDImageXObject) {
+                watermarkMap.put(xObjectName, (PDImageXObject) xObject);
+            }
+        }
+        return watermarkMap;
+    }
|
|
|
+
|
|
|
+    /**
+     * Builds the watermark fragment lists, keyed by report section.
+     *
+     * <p>Blank arguments are replaced by built-in defaults. The fund name, trust name and
+     * register number are concatenated, ASCII parentheses are stripped, and the distinct
+     * characters are collected through a {@link HashSet} (so their order is hash-based,
+     * not input order) and then reversed. A shared base list is derived from those
+     * characters and copied into every section, with section-specific additions.
+     *
+     * @param fundName       fund name; blank falls back to a generic fund label
+     * @param trustName      trust/manager name; blank falls back to a generic company label
+     * @param registerNumber register number; blank falls back to the empty string
+     * @return map with the keys {@code report_name}, {@code less}, {@code more},
+     *         {@code leverage}, {@code base_info}, {@code industry} and {@code market_value}
+     */
+    private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
+        // Substitute defaults for blank inputs.
+        String fund = fundName;
+        if (StrUtil.isBlank(fund)) {
+            fund = "私募证券投资基金";
+        }
+        String trust = trustName;
+        if (StrUtil.isBlank(trust)) {
+            trust = "资产管理有限公司";
+        }
+        String regNo = registerNumber;
+        if (StrUtil.isBlank(regNo)) {
+            regNo = "";
+        }
+
+        // Combined text with parentheses removed.
+        String text = (fund + trust + regNo).replaceAll("[()]", "");
+
+        // Distinct characters, de-duplicated via HashSet and then reversed.
+        List<String> distinctChars = new ArrayList<>(new HashSet<>(convertStringToList(text)));
+        Collections.reverse(distinctChars);
+        String joinedText = String.join("", distinctChars);
+
+        // Base list: each character glued to a CRLF on either side.
+        List<String> base = new ArrayList<>();
+        for (String ch : distinctChars) {
+            base.add(ch + "\r\n");
+            base.add("\r\n" + ch);
+        }
+        // Digits of the fund name, again glued to a CRLF on either side.
+        for (String digit : findDigits(fund)) {
+            base.add("\r\n" + digit);
+            base.add(digit + "\r\n");
+        }
+        Collections.addAll(base, "-", "【", "】", "\r", "\r\n");
+
+        String noNumberText = removeDigits(joinedText);
+
+        // Section-specific lists, each starting from its own copy of the base list.
+        Map<String, List<String>> result = MapUtil.newHashMap(32);
+
+        List<String> reportName = new ArrayList<>(base);
+        reportName.addAll(convertStringToList("有限公司"));
+        result.put("report_name", reportName);
+
+        result.put("less", new ArrayList<>(base));
+
+        List<String> more = new ArrayList<>(base);
+        more.addAll(convertStringToList(noNumberText));
+        result.put("more", more);
+
+        // NOTE(review): "基金资产" is passed as ONE keyword, so only that exact sequence is
+        // removed from the hash-ordered text — confirm whether per-character removal was meant.
+        List<String> leverage = new ArrayList<>(base);
+        leverage.addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
+        result.put("leverage", leverage);
+
+        List<String> baseInfo = new ArrayList<>(base);
+        baseInfo.addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
+        result.put("base_info", baseInfo);
+
+        // NOTE(review): same single-keyword caveat as for "leverage".
+        List<String> industry = new ArrayList<>(base);
+        industry.addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
+        result.put("industry", industry);
+
+        result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
+        return result;
+    }
|
|
|
+
|
|
|
+    /**
+     * Extracts every digit of the text, one list entry per digit, in order of appearance.
+     *
+     * @param text text to scan
+     * @return the digits as single-character strings; empty when there are none
+     */
+    private static List<String> findDigits(String text) {
+        List<String> found = new ArrayList<>();
+        for (Matcher m = Pattern.compile("\\d").matcher(text); m.find(); ) {
+            found.add(m.group());
+        }
+        return found;
+    }
|
|
|
+
|
|
|
+    /**
+     * Returns the text with every digit removed.
+     *
+     * @param text text to clean
+     * @return {@code text} without digits
+     */
+    private static String removeDigits(String text) {
+        Matcher digits = Pattern.compile("\\d").matcher(text);
+        return digits.replaceAll("");
+    }
|
|
|
+
|
|
|
+    /**
+     * Removes every occurrence of each keyword from the text, in the order given.
+     *
+     * <p>Keywords are matched literally. The previous version used
+     * {@code String.replaceAll}, which interprets each keyword as a regular expression,
+     * so a keyword containing a metacharacter ({@code .}, {@code *}, {@code (} ...)
+     * removed the wrong text or threw a {@code PatternSyntaxException}.
+     *
+     * @param text     text to clean
+     * @param keywords literal substrings to remove
+     * @return the text with all keyword occurrences removed
+     */
+    private static String removeKeywords(String text, String... keywords) {
+        for (String keyword : keywords) {
+            // String.replace(CharSequence, CharSequence) is a literal, replace-all operation.
+            text = text.replace(keyword, "");
+        }
+        return text;
+    }
|
|
|
+
|
|
|
+    /**
+     * Splits the text into a list of single-character strings, one per Unicode code point.
+     *
+     * <p>The previous version iterated {@code toCharArray()}, which cuts a supplementary
+     * character (a surrogate pair) into two unpaired surrogates. Iterating by code point
+     * keeps such characters intact; text in the Basic Multilingual Plane is split exactly
+     * as before.
+     *
+     * @param text text to split
+     * @return one string per code point, in order; empty for an empty text
+     */
+    private static List<String> convertStringToList(String text) {
+        List<String> charList = new ArrayList<>();
+        int index = 0;
+        while (index < text.length()) {
+            // 1 for BMP characters and lone surrogates, 2 for a valid surrogate pair.
+            int width = Character.charCount(text.codePointAt(index));
+            charList.add(text.substring(index, index + width));
+            index += width;
+        }
+        return charList;
+    }
|
|
|
+
|
|
|
+    /**
+     * Strips watermark fragments from a string and normalises it.
+     *
+     * <p>Steps, in order: remove every occurrence of each {@code wmList} entry, convert
+     * full-width parentheses to ASCII ones, delete spaces, and drop a leading parenthesis
+     * if one remains.
+     *
+     * <p>Fixes over the previous version: entries are quoted with {@link Pattern#quote}
+     * before being joined into the alternation, so entries containing regex
+     * metacharacters are matched literally instead of over-matching or throwing; and the
+     * parenthesis conversion now targets the full-width characters U+FF08/U+FF09 (it
+     * previously replaced ASCII parentheses with themselves).
+     *
+     * @param wmList literal fragments to remove; {@code null}, and null or empty entries,
+     *               are ignored
+     * @param string the text to clean
+     * @return the cleaned text
+     */
+    public static String processString(List<String> wmList, String string) {
+        // Build a literal alternation, preserving the list order (earlier entries win).
+        StringBuilder alternation = new StringBuilder();
+        if (wmList != null) {
+            for (String fragment : wmList) {
+                if (fragment == null || fragment.isEmpty()) {
+                    continue;
+                }
+                if (alternation.length() > 0) {
+                    alternation.append('|');
+                }
+                alternation.append(Pattern.quote(fragment));
+            }
+        }
+        if (alternation.length() > 0) {
+            string = Pattern.compile(alternation.toString()).matcher(string).replaceAll("");
+        }
+        // Full-width (Chinese) parentheses -> ASCII parentheses.
+        string = string.replace('\uFF08', '(').replace('\uFF09', ')');
+        // Remove spaces.
+        string = string.replace(" ", "");
+        // Drop a leading parenthesis left over after the removals.
+        if (!string.isEmpty() && (string.charAt(0) == '(' || string.charAt(0) == ')')) {
+            string = string.substring(1);
+        }
+        return string;
+    }
|
|
|
+
|
|
|
+    /**
+     * Deletes every match of a regular expression from the input.
+     *
+     * @param input   text to clean
+     * @param pattern regular expression whose matches are removed
+     * @return the input with all matches replaced by the empty string
+     */
+    private static String removeMatches(String input, String pattern) {
+        return Pattern.compile(pattern).matcher(input).replaceAll("");
+    }
|
|
|
+
|
|
|
+    /**
+     * Tells whether the input begins with an opening or closing parenthesis.
+     *
+     * @param input text to test
+     * @return {@code true} if the first character is {@code (} or {@code )}
+     */
+    private static boolean startsWithParenthesis(String input) {
+        return Pattern.compile("^[()].*").matcher(input).find();
+    }
|
|
|
+
|
|
|
+// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
// PDResources resources = page.getResources();
|
|
|
-// Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
-// for (COSName xObjectName : xObjectNames) {
|
|
|
-// PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
-// PDStream stream = xObject.getStream();
|
|
|
-// PDImageXObject imageXObject = null;
|
|
|
-// try {
|
|
|
-// imageXObject = new PDImageXObject(stream, resources);
|
|
|
-// } catch (Exception e) {
|
|
|
-// e.printStackTrace();
|
|
|
-// }
|
|
|
-// if (imageXObject != null) {
|
|
|
-// watermarkMap.put(xObjectName, imageXObject);
|
|
|
-// }
|
|
|
-// }
|
|
|
-// return watermarkMap;
|
|
|
-// }
|
|
|
+//// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
+//// return;
|
|
|
+//// }
|
|
|
+// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
+// stripper.setSortByPosition(true);
|
|
|
+// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
+// stripper.extractRegions(page);
|
|
|
//
|
|
|
-// private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
-// Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
-// // 生成水印列表
|
|
|
+// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
+// engine.addOperator(new SetMatrix(stripper));
|
|
|
//
|
|
|
-// fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
-// trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
-// registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
-// String text = fundName + trustName + registerNumber;
|
|
|
-// text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
-// List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
-// Collections.reverse(textList);
|
|
|
-// StringBuilder sb = new StringBuilder(textList.size());
|
|
|
-// for (String ch : textList) {
|
|
|
-// sb.append(ch);
|
|
|
-// }
|
|
|
-// String joinedText = sb.toString();
|
|
|
+// }
|
|
|
//
|
|
|
-// // 基本水印列表
|
|
|
-// List<String> wkList = new ArrayList<>();
|
|
|
-// for (String ch : textList) {
|
|
|
-// wkList.add(ch + "\r\n");
|
|
|
-// wkList.add("\r\n" + ch);
|
|
|
+// private static void processResources(PDResources resources) throws IOException {
|
|
|
+// for (COSName name : resources.getXObjectNames()) {
|
|
|
+// PDXObject xobject = resources.getXObject(name);
|
|
|
+// if (xobject instanceof PDFormXObject) {
|
|
|
+// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
+// writeTokensToStream(formXObject.getContentStream(),
|
|
|
+// createTokensWithoutText(formXObject));
|
|
|
+// processResources(formXObject.getResources());
|
|
|
+// }
|
|
|
// }
|
|
|
-//
|
|
|
-// // 查找数字
|
|
|
-// List<String> matches = findDigits(fundName);
|
|
|
-// if (!matches.isEmpty()) {
|
|
|
-// for (String match : matches) {
|
|
|
-// wkList.add("\r\n" + match);
|
|
|
-// wkList.add(match + "\r\n");
|
|
|
+// for (COSName name : resources.getPatternNames()) {
|
|
|
+// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
+// if (pattern instanceof PDTilingPattern) {
|
|
|
+// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
+// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
+// createTokensWithoutText(tilingPattern));
|
|
|
+// processResources(tilingPattern.getResources());
|
|
|
// }
|
|
|
// }
|
|
|
-// wkList.add("-");
|
|
|
-// wkList.add("【");
|
|
|
-// wkList.add("】");
|
|
|
-// wkList.add("\r");
|
|
|
-// wkList.add("\r\n");
|
|
|
-//
|
|
|
-// String noNumberText = removeDigits(joinedText);
|
|
|
-//
|
|
|
-// // 生成不同字段的水印列表
|
|
|
-// result.put("report_name", new ArrayList<>(wkList));
|
|
|
-// result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
-//
|
|
|
-// result.put("less", new ArrayList<>(wkList));
|
|
|
-//
|
|
|
-// result.put("more", new ArrayList<>(wkList));
|
|
|
-// result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
-//
|
|
|
-// result.put("leverage", new ArrayList<>(wkList));
|
|
|
-// result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
-//
|
|
|
-// result.put("base_info", new ArrayList<>(wkList));
|
|
|
-// result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
-//
|
|
|
-// result.put("industry", new ArrayList<>(wkList));
|
|
|
-// result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
-//
|
|
|
-// result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
-// return result;
|
|
|
// }
|
|
|
//
|
|
|
-// private static List<String> findDigits(String text) {
|
|
|
-// List<String> digits = new ArrayList<>();
|
|
|
-// Pattern pattern = Pattern.compile("\\d");
|
|
|
-// Matcher matcher = pattern.matcher(text);
|
|
|
-// while (matcher.find()) {
|
|
|
-// digits.add(matcher.group());
|
|
|
+// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
+// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
+// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
+// writer.writeTokens(newTokens);
|
|
|
// }
|
|
|
-// return digits;
|
|
|
-// }
|
|
|
-//
|
|
|
-// private static String removeDigits(String text) {
|
|
|
-// return text.replaceAll("\\d", "");
|
|
|
// }
|
|
|
//
|
|
|
-// private static String removeKeywords(String text, String... keywords) {
|
|
|
-// for (String keyword : keywords) {
|
|
|
-// text = text.replaceAll(keyword, "");
|
|
|
-// }
|
|
|
-// return text;
|
|
|
-// }
|
|
|
+// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
+// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
+// Object token = parser.parseNextToken();
|
|
|
+// List<Object> newTokens = new ArrayList<>();
|
|
|
+// while (token != null) {
|
|
|
+// if (token instanceof Operator op) {
|
|
|
+// String opName = op.getName();
|
|
|
+// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
+// // remove the argument to this operator
|
|
|
+// newTokens.remove(newTokens.size() - 1);
|
|
|
//
|
|
|
-// private static List<String> convertStringToList(String text) {
|
|
|
-// List<String> charList = new ArrayList<>();
|
|
|
-// for (char c : text.toCharArray()) {
|
|
|
-// charList.add(c + "");
|
|
|
-// }
|
|
|
-// return charList;
|
|
|
-// }
|
|
|
-//
|
|
|
-// public static String processString(List<String> wmList, String string) {
|
|
|
-// // 生成正则表达式模式
|
|
|
-// String pat = String.join("|", wmList);
|
|
|
-// // 使用正则表达式移除wmList中的元素
|
|
|
-// string = removeMatches(string, pat);
|
|
|
-// // 替换中文括号为英文括号
|
|
|
-// string = string.replace("(", "(").replace(")", ")");
|
|
|
-// // 移除空格
|
|
|
-// string = string.replace(" ", "");
|
|
|
-// // 如果字符串以括号开头,则移除第一个字符
|
|
|
-// if (startsWithParenthesis(string)) {
|
|
|
-// string = string.substring(1);
|
|
|
+// token = parser.parseNextToken();
|
|
|
+// continue;
|
|
|
+// }
|
|
|
+// }
|
|
|
+// newTokens.add(token);
|
|
|
+// token = parser.parseNextToken();
|
|
|
// }
|
|
|
-//
|
|
|
-// return string;
|
|
|
+// return newTokens;
|
|
|
// }
|
|
|
-//
|
|
|
-// private static String removeMatches(String input, String pattern) {
|
|
|
-// // 编译正则表达式
|
|
|
-// Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
-// // 创建Matcher对象
|
|
|
-// Matcher matcher = compiledPattern.matcher(input);
|
|
|
-// // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
-// return matcher.replaceAll("");
|
|
|
-// }
|
|
|
-//
|
|
|
-// private static boolean startsWithParenthesis(String input) {
|
|
|
-// // 匹配以括号开头的字符串
|
|
|
-// Pattern pattern = Pattern.compile("^[()].*");
|
|
|
-// Matcher matcher = pattern.matcher(input);
|
|
|
-// return matcher.find();
|
|
|
-// }
|
|
|
-//
|
|
|
-//// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
-//// PDResources resources = page.getResources();
|
|
|
-////// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
-////// return;
|
|
|
-////// }
|
|
|
-//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
-//// stripper.setSortByPosition(true);
|
|
|
-//// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
-//// stripper.extractRegions(page);
|
|
|
-////
|
|
|
-//// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
-//// engine.addOperator(new SetMatrix(stripper));
|
|
|
-////
|
|
|
-//// }
|
|
|
-////
|
|
|
-//// private static void processResources(PDResources resources) throws IOException {
|
|
|
-//// for (COSName name : resources.getXObjectNames()) {
|
|
|
-//// PDXObject xobject = resources.getXObject(name);
|
|
|
-//// if (xobject instanceof PDFormXObject) {
|
|
|
-//// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
-//// writeTokensToStream(formXObject.getContentStream(),
|
|
|
-//// createTokensWithoutText(formXObject));
|
|
|
-//// processResources(formXObject.getResources());
|
|
|
-//// }
|
|
|
-//// }
|
|
|
-//// for (COSName name : resources.getPatternNames()) {
|
|
|
-//// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
-//// if (pattern instanceof PDTilingPattern) {
|
|
|
-//// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
-//// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
-//// createTokensWithoutText(tilingPattern));
|
|
|
-//// processResources(tilingPattern.getResources());
|
|
|
-//// }
|
|
|
-//// }
|
|
|
-//// }
|
|
|
-////
|
|
|
-//// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
-//// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
-//// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
-//// writer.writeTokens(newTokens);
|
|
|
-//// }
|
|
|
-//// }
|
|
|
-////
|
|
|
-//// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
-//// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
-//// Object token = parser.parseNextToken();
|
|
|
-//// List<Object> newTokens = new ArrayList<>();
|
|
|
-//// while (token != null) {
|
|
|
-//// if (token instanceof Operator op) {
|
|
|
-//// String opName = op.getName();
|
|
|
-//// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
-//// // remove the argument to this operator
|
|
|
-//// newTokens.remove(newTokens.size() - 1);
|
|
|
-////
|
|
|
-//// token = parser.parseNextToken();
|
|
|
-//// continue;
|
|
|
-//// }
|
|
|
-//// }
|
|
|
-//// newTokens.add(token);
|
|
|
-//// token = parser.parseNextToken();
|
|
|
-//// }
|
|
|
-//// return newTokens;
|
|
|
-//// }
|
|
|
-//}
|
|
|
+}
|