|
@@ -1,370 +0,0 @@
|
|
-//package com.simuwang.daq.utils;
|
|
|
|
-//
|
|
|
|
-//import cn.hutool.core.collection.ListUtil;
|
|
|
|
-//import cn.hutool.core.map.MapUtil;
|
|
|
|
-//import cn.hutool.core.util.ReflectUtil;
|
|
|
|
-//import cn.hutool.core.util.StrUtil;
|
|
|
|
-//import cn.hutool.http.HttpUtil;
|
|
|
|
-//import cn.hutool.json.JSONObject;
|
|
|
|
-//import cn.hutool.json.JSONUtil;
|
|
|
|
-//import com.simuwang.base.common.conts.Constants;
|
|
|
|
-//import com.simuwang.base.pojo.dto.report.PythonResult;
|
|
|
|
-//import com.simuwang.base.pojo.dto.report.ReportFundInfoDTO;
|
|
|
|
-//import com.simuwang.daq.components.CustomPDFTextStripper;
|
|
|
|
-//import com.simuwang.daq.components.PythonReportConverter;
|
|
|
|
-//import com.smppw.common.pojo.ValueLabelVO;
|
|
|
|
-//import org.apache.pdfbox.Loader;
|
|
|
|
-//import org.apache.pdfbox.cos.COSName;
|
|
|
|
-//import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.PDPage;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.PDResources;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.common.PDStream;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
|
|
|
-//import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
|
|
|
-//import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
|
-//import technology.tabula.*;
|
|
|
|
-//import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
|
|
|
|
-//
|
|
|
|
-//import java.io.IOException;
|
|
|
|
-//import java.util.*;
|
|
|
|
-//import java.util.regex.Matcher;
|
|
|
|
-//import java.util.regex.Pattern;
|
|
|
|
-//import java.util.stream.Collectors;
|
|
|
|
-//
|
|
|
|
-//public class ReportParseUtil {
|
|
|
|
-// public static void main(String[] args) throws IOException {
|
|
|
|
-//// String fileName = "SJM970_排排精选进取一号私募证券投资基金_2022年第4季度报告.pdf";
|
|
|
|
-//// Pattern pattern = Pattern.compile("S(?:[A-Z]{0}[0-9]{5}|[A-Z][0-9]{4}|[A-Z]{2}[0-9]{3}|[A-Z]{3}[0-9]{2})");
|
|
|
|
-//// Matcher matcher = pattern.matcher(fileName);
|
|
|
|
-//// String registerNumber = null;
|
|
|
|
-//// if (matcher.find()) {
|
|
|
|
-//// registerNumber = matcher.group();
|
|
|
|
-//// }
|
|
|
|
-////
|
|
|
|
-//// int type = 1;
|
|
|
|
-//// String baseUrl = "http://192.168.0.81:8088";
|
|
|
|
-//// String api = "/api/v1/parse/amac_report";
|
|
|
|
-//// Map<String, Object> params = MapUtil.newHashMap(16);
|
|
|
|
-//// params.put("file_id", 111112);
|
|
|
|
-//// params.put("file_path", "E:/workproject/fastparse/src/fastparse/static/reports/quarterly_report/13445.pdf");
|
|
|
|
-//// params.put("register_number", registerNumber);
|
|
|
|
-//// params.put("file_type", type);
|
|
|
|
-//// params.put("file_name", fileName);
|
|
|
|
-//// params.put("fund_name", null);
|
|
|
|
-//// params.put("trust_name", null);
|
|
|
|
-//// String body = HttpUtil.post(baseUrl + api, JSONUtil.toJsonStr(params));
|
|
|
|
-//// JSONObject obj = JSONUtil.parseObj(body);
|
|
|
|
-//// PythonResult<?> result = PythonReportConverter.convert(obj, type);
|
|
|
|
-//// System.out.println(result);
|
|
|
|
-//
|
|
|
|
-// List<ValueLabelVO> fieldMapper = ListUtil.list(false);
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("fundName", "基金名称"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("registerNumber", "基金编码"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("operationType", "基金运作方式"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("fundType", "基金类别"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("inceptionDate", "基金成立日期"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("trustName", "基金托管人"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("custodianName", "基金管理人"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("advisorName", "投资顾问"));
|
|
|
|
-// fieldMapper.add(new ValueLabelVO("reviewed", "复核"));
|
|
|
|
-//
|
|
|
|
-//// Map<String, List<String>> watermarkMap = generateWatermarkListMap("幻方量化1000指数专享1号5期私募证券投资基金", "宁波幻方量化投资管理合伙企业(有限合伙)", null);
|
|
|
|
-//// List<String> watermarks = watermarkMap.get("less");
|
|
|
|
-//
|
|
|
|
-//// System.out.println(watermarks);
|
|
|
|
-//// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("D:\\Documents\\workspace\\idea\\smppw\\data-daq\\service-daq\\src\\main\\java\\com\\simuwang\\daq\\utils\\12931.pdf"))) {
|
|
|
|
-// try (PDDocument document = Loader.loadPDF(new RandomAccessReadBufferedFile("C:\\Users\\Administrator\\Desktop\\self\\新报告解析\\基协报告\\季报\\SVP311_私募基金季报PDF_国恩回报6号增强私募证券投资基金_2024年06月30日.pdf"))) {
|
|
|
|
-//// PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
-//// stripper.setSortByPosition(true);
|
|
|
|
-//// String allText = stripper.getText(document);
|
|
|
|
-//// List<String> textList = StrUtil.split(allText, "\r\n");
|
|
|
|
-//// System.out.println(textList);
|
|
|
|
-//
|
|
|
|
-// PDFTextStripper textStripper = new CustomPDFTextStripper();
|
|
|
|
-// textStripper.setSortByPosition(true);
|
|
|
|
-// String text1 = textStripper.getText(document);
|
|
|
|
-// text1 = text1.replace(Constants.WATERMARK_REPLACE, Constants.EMPTY);
|
|
|
|
-// List<String> textList = StrUtil.split(text1, System.lineSeparator());
|
|
|
|
-// textList.removeIf(StrUtil::isBlank);
|
|
|
|
-// System.out.println(textList.get(0));
|
|
|
|
-//
|
|
|
|
-//// for (PDPage page : document.getPages()) {
|
|
|
|
-////
|
|
|
|
-////// PDResources resources = page.getResources();
|
|
|
|
-////// Map<COSName, PDImageXObject> imageXObjectMap = findImageWatermark(page);
|
|
|
|
-////// Iterator<COSName> iterator = resources.getXObjectNames().iterator();
|
|
|
|
-////// while (iterator.hasNext()) {
|
|
|
|
-////// COSName next = iterator.next();
|
|
|
|
-////// if (imageXObjectMap.containsKey(next)) {
|
|
|
|
-////// iterator.remove();
|
|
|
|
-////// }
|
|
|
|
-////// }
|
|
|
|
-////// removeTextWatermark(page);
|
|
|
|
-////
|
|
|
|
-//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
|
-//// stripper.setSortByPosition(true);
|
|
|
|
-//// stripper.addRegion("page", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
|
-//// stripper.extractRegions(page);
|
|
|
|
-//// for (String region : stripper.getRegions()) {
|
|
|
|
-//// String text = stripper.getTextForRegion(region);
|
|
|
|
-//// String res = processString(watermarks, text);
|
|
|
|
-//// System.out.println("原数据:" + text + ", 去除水印后数据:" + res);
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-//// document.save(new File("./1.pdf"));
|
|
|
|
-//
|
|
|
|
-// SpreadsheetExtractionAlgorithm extractionAlgorithm = new SpreadsheetExtractionAlgorithm();
|
|
|
|
-// PageIterator pageIterator = new ObjectExtractor(document).extract();
|
|
|
|
-// while (pageIterator.hasNext()) {
|
|
|
|
-// Page page = pageIterator.next();
|
|
|
|
-// List<Table> tables = extractionAlgorithm.extract(page);
|
|
|
|
-// tables = tables.stream().distinct().collect(Collectors.toList());
|
|
|
|
-// for (Table table : tables) {
|
|
|
|
-// if (table.getColCount() == 4) {
|
|
|
|
-// Map<String, Object> baseInfoMap = MapUtil.newHashMap(32);
|
|
|
|
-// for (int i = 0; i < table.getRows().size(); i++) {
|
|
|
|
-// List<RectangularTextContainer> cols = table.getRows().get(i);
|
|
|
|
-// for (int j = 0; j < 2; j++) {
|
|
|
|
-// baseInfoMap.put(cols.get(j * 2).getText(), cols.get(j * 2 + 1).getText());
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// ReportFundInfoDTO reportFundInfo = new ReportFundInfoDTO();
|
|
|
|
-// baseInfoMap.forEach((k, v) -> {
|
|
|
|
-// for (ValueLabelVO vo : fieldMapper) {
|
|
|
|
-// String fieldName = vo.getValue();
|
|
|
|
-// List<String> labels = StrUtil.split(vo.getLabel(), ",");
|
|
|
|
-// if (labels.contains(k)) {
|
|
|
|
-// ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
|
-// break;
|
|
|
|
-// }
|
|
|
|
-// for (String label : labels) {
|
|
|
|
-// if (k.contains(label)) {
|
|
|
|
-// ReflectUtil.setFieldValue(reportFundInfo, fieldName, v);
|
|
|
|
-// break;
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// });
|
|
|
|
-// System.out.println(reportFundInfo);
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// /**
|
|
|
|
-// * 找图片水印
|
|
|
|
-// *
|
|
|
|
-// * @param page
|
|
|
|
-// * @return
|
|
|
|
-// * @throws IOException
|
|
|
|
-// */
|
|
|
|
-// public static Map<COSName, PDImageXObject> findImageWatermark(PDPage page) throws IOException {
|
|
|
|
-// Map<COSName, PDImageXObject> watermarkMap = MapUtil.newHashMap();
|
|
|
|
-// PDResources resources = page.getResources();
|
|
|
|
-// Iterable<COSName> xObjectNames = resources.getXObjectNames();
|
|
|
|
-// for (COSName xObjectName : xObjectNames) {
|
|
|
|
-// PDXObject xObject = resources.getXObject(xObjectName);
|
|
|
|
-// PDStream stream = xObject.getStream();
|
|
|
|
-// PDImageXObject imageXObject = null;
|
|
|
|
-// try {
|
|
|
|
-// imageXObject = new PDImageXObject(stream, resources);
|
|
|
|
-// } catch (Exception e) {
|
|
|
|
-// e.printStackTrace();
|
|
|
|
-// }
|
|
|
|
-// if (imageXObject != null) {
|
|
|
|
-// watermarkMap.put(xObjectName, imageXObject);
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// return watermarkMap;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static Map<String, List<String>> generateWatermarkListMap(String fundName, String trustName, String registerNumber) {
|
|
|
|
-// Map<String, List<String>> result = MapUtil.newHashMap(32);
|
|
|
|
-// // 生成水印列表
|
|
|
|
-//
|
|
|
|
-// fundName = StrUtil.isNotBlank(fundName) ? fundName : "私募证券投资基金";
|
|
|
|
-// trustName = StrUtil.isNotBlank(trustName) ? trustName : "资产管理有限公司";
|
|
|
|
-// registerNumber = StrUtil.isNotBlank(registerNumber) ? registerNumber : "";
|
|
|
|
-// String text = fundName + trustName + registerNumber;
|
|
|
|
-// text = text.replaceAll("[()]", ""); // 移除括号
|
|
|
|
-// List<String> textList = new ArrayList<>(new HashSet<>(convertStringToList(text)));
|
|
|
|
-// Collections.reverse(textList);
|
|
|
|
-// StringBuilder sb = new StringBuilder(textList.size());
|
|
|
|
-// for (String ch : textList) {
|
|
|
|
-// sb.append(ch);
|
|
|
|
-// }
|
|
|
|
-// String joinedText = sb.toString();
|
|
|
|
-//
|
|
|
|
-// // 基本水印列表
|
|
|
|
-// List<String> wkList = new ArrayList<>();
|
|
|
|
-// for (String ch : textList) {
|
|
|
|
-// wkList.add(ch + "\r\n");
|
|
|
|
-// wkList.add("\r\n" + ch);
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// // 查找数字
|
|
|
|
-// List<String> matches = findDigits(fundName);
|
|
|
|
-// if (!matches.isEmpty()) {
|
|
|
|
-// for (String match : matches) {
|
|
|
|
-// wkList.add("\r\n" + match);
|
|
|
|
-// wkList.add(match + "\r\n");
|
|
|
|
-// }
|
|
|
|
-// }
|
|
|
|
-// wkList.add("-");
|
|
|
|
-// wkList.add("【");
|
|
|
|
-// wkList.add("】");
|
|
|
|
-// wkList.add("\r");
|
|
|
|
-// wkList.add("\r\n");
|
|
|
|
-//
|
|
|
|
-// String noNumberText = removeDigits(joinedText);
|
|
|
|
-//
|
|
|
|
-// // 生成不同字段的水印列表
|
|
|
|
-// result.put("report_name", new ArrayList<>(wkList));
|
|
|
|
-// result.get("report_name").addAll(convertStringToList("有限公司"));
|
|
|
|
-//
|
|
|
|
-// result.put("less", new ArrayList<>(wkList));
|
|
|
|
-//
|
|
|
|
-// result.put("more", new ArrayList<>(wkList));
|
|
|
|
-// result.get("more").addAll(convertStringToList(noNumberText));
|
|
|
|
-//
|
|
|
|
-// result.put("leverage", new ArrayList<>(wkList));
|
|
|
|
-// result.get("leverage").addAll(convertStringToList(removeKeywords(noNumberText, "基金资产")));
|
|
|
|
-//
|
|
|
|
-// result.put("base_info", new ArrayList<>(wkList));
|
|
|
|
-// result.get("base_info").addAll(convertStringToList(removeKeywords(text, "基", "金", "投资", "管理", "有", "份", "融", "资", "产", "本", "号", "收益", "策略", "期")));
|
|
|
|
-//
|
|
|
|
-// result.put("industry", new ArrayList<>(wkList));
|
|
|
|
-// result.get("industry").addAll(convertStringToList(removeKeywords(noNumberText, "基金融公产")));
|
|
|
|
-//
|
|
|
|
-// result.put("market_value", new ArrayList<>(Collections.singletonList("\n")));
|
|
|
|
-// return result;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static List<String> findDigits(String text) {
|
|
|
|
-// List<String> digits = new ArrayList<>();
|
|
|
|
-// Pattern pattern = Pattern.compile("\\d");
|
|
|
|
-// Matcher matcher = pattern.matcher(text);
|
|
|
|
-// while (matcher.find()) {
|
|
|
|
-// digits.add(matcher.group());
|
|
|
|
-// }
|
|
|
|
-// return digits;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static String removeDigits(String text) {
|
|
|
|
-// return text.replaceAll("\\d", "");
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static String removeKeywords(String text, String... keywords) {
|
|
|
|
-// for (String keyword : keywords) {
|
|
|
|
-// text = text.replaceAll(keyword, "");
|
|
|
|
-// }
|
|
|
|
-// return text;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static List<String> convertStringToList(String text) {
|
|
|
|
-// List<String> charList = new ArrayList<>();
|
|
|
|
-// for (char c : text.toCharArray()) {
|
|
|
|
-// charList.add(c + "");
|
|
|
|
-// }
|
|
|
|
-// return charList;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// public static String processString(List<String> wmList, String string) {
|
|
|
|
-// // 生成正则表达式模式
|
|
|
|
-// String pat = String.join("|", wmList);
|
|
|
|
-// // 使用正则表达式移除wmList中的元素
|
|
|
|
-// string = removeMatches(string, pat);
|
|
|
|
-// // 替换中文括号为英文括号
|
|
|
|
-// string = string.replace("(", "(").replace(")", ")");
|
|
|
|
-// // 移除空格
|
|
|
|
-// string = string.replace(" ", "");
|
|
|
|
-// // 如果字符串以括号开头,则移除第一个字符
|
|
|
|
-// if (startsWithParenthesis(string)) {
|
|
|
|
-// string = string.substring(1);
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// return string;
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static String removeMatches(String input, String pattern) {
|
|
|
|
-// // 编译正则表达式
|
|
|
|
-// Pattern compiledPattern = Pattern.compile(pattern);
|
|
|
|
-// // 创建Matcher对象
|
|
|
|
-// Matcher matcher = compiledPattern.matcher(input);
|
|
|
|
-// // 使用replaceAll方法替换所有匹配到的字符为空字符串
|
|
|
|
-// return matcher.replaceAll("");
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-// private static boolean startsWithParenthesis(String input) {
|
|
|
|
-// // 匹配以括号开头的字符串
|
|
|
|
-// Pattern pattern = Pattern.compile("^[()].*");
|
|
|
|
-// Matcher matcher = pattern.matcher(input);
|
|
|
|
-// return matcher.find();
|
|
|
|
-// }
|
|
|
|
-//
|
|
|
|
-//// public static void removeTextWatermark(PDPage page) throws IOException {
|
|
|
|
-//// PDResources resources = page.getResources();
|
|
|
|
-////// if (StrUtil.isAllBlank(fundName, trustName)) {
|
|
|
|
-////// return;
|
|
|
|
-////// }
|
|
|
|
-//// PDFTextStripperByArea stripper = new PDFTextStripperByArea();
|
|
|
|
-//// stripper.setSortByPosition(true);
|
|
|
|
-//// stripper.addRegion("watermark", new Rectangle2D.Float(0, 0, page.getMediaBox().getWidth(), page.getMediaBox().getHeight()));
|
|
|
|
-//// stripper.extractRegions(page);
|
|
|
|
-////
|
|
|
|
-//// PDFStreamEngine engine = new PDFTextStripper();
|
|
|
|
-//// engine.addOperator(new SetMatrix(stripper));
|
|
|
|
-////
|
|
|
|
-//// }
|
|
|
|
-////
|
|
|
|
-//// private static void processResources(PDResources resources) throws IOException {
|
|
|
|
-//// for (COSName name : resources.getXObjectNames()) {
|
|
|
|
-//// PDXObject xobject = resources.getXObject(name);
|
|
|
|
-//// if (xobject instanceof PDFormXObject) {
|
|
|
|
-//// PDFormXObject formXObject = (PDFormXObject) xobject;
|
|
|
|
-//// writeTokensToStream(formXObject.getContentStream(),
|
|
|
|
-//// createTokensWithoutText(formXObject));
|
|
|
|
-//// processResources(formXObject.getResources());
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-//// for (COSName name : resources.getPatternNames()) {
|
|
|
|
-//// PDAbstractPattern pattern = resources.getPattern(name);
|
|
|
|
-//// if (pattern instanceof PDTilingPattern) {
|
|
|
|
-//// PDTilingPattern tilingPattern = (PDTilingPattern) pattern;
|
|
|
|
-//// writeTokensToStream(tilingPattern.getContentStream(),
|
|
|
|
-//// createTokensWithoutText(tilingPattern));
|
|
|
|
-//// processResources(tilingPattern.getResources());
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-////
|
|
|
|
-//// private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
|
|
|
|
-//// try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
|
|
|
|
-//// ContentStreamWriter writer = new ContentStreamWriter(out);
|
|
|
|
-//// writer.writeTokens(newTokens);
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-////
|
|
|
|
-//// private static List<Object> createTokensWithoutText(PDContentStream contentStream) throws IOException {
|
|
|
|
-//// PDFStreamParser parser = new PDFStreamParser(contentStream);
|
|
|
|
-//// Object token = parser.parseNextToken();
|
|
|
|
-//// List<Object> newTokens = new ArrayList<>();
|
|
|
|
-//// while (token != null) {
|
|
|
|
-//// if (token instanceof Operator op) {
|
|
|
|
-//// String opName = op.getName();
|
|
|
|
-//// if (OperatorName.SET_MATRIX.equals(opName)) {
|
|
|
|
-//// // remove the argument to this operator
|
|
|
|
-//// newTokens.remove(newTokens.size() - 1);
|
|
|
|
-////
|
|
|
|
-//// token = parser.parseNextToken();
|
|
|
|
-//// continue;
|
|
|
|
-//// }
|
|
|
|
-//// }
|
|
|
|
-//// newTokens.add(token);
|
|
|
|
-//// token = parser.parseNextToken();
|
|
|
|
-//// }
|
|
|
|
-//// return newTokens;
|
|
|
|
-//// }
|
|
|
|
-//}
|
|
|