package com.simuwang.daq.service; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.ListUtil; import cn.hutool.core.exceptions.ExceptionUtil; import cn.hutool.core.text.StrBuilder; import cn.hutool.core.util.StrUtil; import com.simuwang.base.common.enums.WaterMarkEnum; import com.simuwang.base.common.util.ExcelUtil; import com.simuwang.base.pojo.dto.ValuationPdfTransformToExcelDTO; import org.apache.commons.io.FileUtils; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.util.CellRangeAddress; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import org.springframework.web.multipart.MultipartFile; import technology.tabula.*; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; @Component public class PdfToExcelService { private static final Logger log = LoggerFactory.getLogger(PdfToExcelService.class); @Value("${email.file.path}") private String path; private static final List EXTRA_FIELD_LIST = ListUtil.toList("成本", "市值", "估值增值"); private static final List NUMBER_FIELD_LIST = ListUtil.toList("数量", "单位成本", "成本", "成本占净值", "成本占净值%", "成本占净值", "市价", "市值", "市值占净值比", "市值占净值比%"); private static final List TITLE_FIELD_LIST = ListUtil.toList("单位净值:", "位净值:", "估值日期:", "值日期:", "单位:", "科目代码", "科目名", "数量", "单位成本", "位成本", "成本", "成本占净值%", "本币", "十亿千百十", "成本占净值", "市价", "市值", "值", "市值占净值%", "市值占净", "估值增值", "停牌信息", "权益信息"); public static final List TOTAL_MARKET_FIELD = ListUtil.toList("资合计", "资类合计:", "资类合计:", "资净值", "基金资净值:", "基金资净值:", "资产资净值", "资资净值:", "资资净值:", "信托资净值:", "信托资净值:"); /** * PDF类型的估值表文件转为excel类型 * * @param fileList 待转换的文件 * @return 转换为excel类型的文件 */ public List pdfToExcel(List fileList) { if (fileList == null) { return CollUtil.newArrayList(); } List pdfFileList = fileList.stream() .filter(e -> ExcelUtil.isPdf(e.getName())).collect(Collectors.toList()); if (CollUtil.isEmpty(pdfFileList)) { return CollUtil.newArrayList(); } List pdfToExcelDTOList = CollUtil.newArrayList(); String excelUploadDir = path + File.separator + "valuation_table_excel" + File.separator; for (File multipartFile : pdfFileList) { String excelFilePath = excelUploadDir + multipartFile.getName().replace(".pdf", "") + System.currentTimeMillis()+".xlsx"; File savefile = new File(excelFilePath); if (!savefile.exists()) { if (!savefile.getParentFile().exists()) { savefile.getParentFile().mkdirs(); savefile.getParentFile().setExecutable(true); } } try (OutputStream outputStream = Files.newOutputStream(Paths.get(excelFilePath))) { PDDocument document = Loader.loadPDF(multipartFile); //采用pdfBox读取PDF内容 List lines = readTextFromPDFUsingPdfBox(document); //判断PDF是否存在水印 以及 水印类型(waterMarkType为null,代表没有水印) WaterMarkEnum waterMark = getPdfWaterMarkType(lines); //创建一个新的Excel工作簿 Workbook workbook = new XSSFWorkbook(); Sheet sheet = workbook.createSheet("Sheet1"); //1.处理header行(估值表,专用表,日期和单位净值) Integer headerRowNumber = handlerExcelHeader(lines, sheet, waterMark); //采用tabula技术提取表格内容 List lineList = extractTableFormPDFUsingTabula(document, waterMark); //2.额外处理标题行 Integer rowNumber = handlerTitleField(lineList, sheet, headerRowNumber); //招商证券股份有限公司的估值表有特殊情况,存在两个标题字段列 boolean isSpecial = rowNumber == 7; //3.合并标题行单元格 mergeExcelCell(sheet, isSpecial); //4.遍历每一行并将其添加到Excel工作表中 readDateToExcel(sheet, lineList, rowNumber, waterMark); //将Excel工作簿写入输出流 workbook.write(outputStream); document.close(); ValuationPdfTransformToExcelDTO toExcelDTO = new ValuationPdfTransformToExcelDTO(); toExcelDTO.setOriginalFileName(multipartFile.getName()); toExcelDTO.setExcelFile(new File(excelFilePath)); toExcelDTO.setFileUrl(multipartFile.getAbsolutePath()); pdfToExcelDTOList.add(toExcelDTO); } catch (Exception e) { log.error("pdf转为excel报错,堆栈:{}", ExceptionUtil.stacktraceToString(e)); } } return pdfToExcelDTOList; } /** * 估值表_纽达投资利道永晟五号私募证券投资基金_SGL183_202312292024-02-26.pdf * * @param lines pdfBox读取PDF内容中的水印文字 * @param waterMark 水印 */ private List removeWaterMark(List lines, WaterMarkEnum waterMark) { if (waterMark == null) { return lines; } String regex = waterMark.getRegex(); return lines.stream().map(e -> e.replaceAll(regex, "")).filter(StrUtil::isNotBlank).collect(Collectors.toList()); } private WaterMarkEnum getPdfWaterMarkType(List lines) { if (CollUtil.isEmpty(lines)) { return null; } //pdf文本内容 String content = String.join("", lines); for (WaterMarkEnum waterMarkEnum : WaterMarkEnum.values()) { boolean containsAllKeywords = true; String waterMarkText = waterMarkEnum.getText(); String[] waterMarkTextArray = StrUtil.cut(waterMarkText, 1); for (String text : waterMarkTextArray) { if (!content.contains(text)) { containsAllKeywords = false; break; } } if (containsAllKeywords) { return waterMarkEnum; } } return null; } private List extractTableFormPDFUsingTabula(PDDocument document, WaterMarkEnum waterMark) { StrBuilder tableDataText = new StrBuilder(); PageIterator extract = new ObjectExtractor(document).extract(); String regex = waterMark != null ? waterMark.getRegex() : null; if (StrUtil.isBlank(regex)) { while (extract.hasNext()) { Page next = extract.next(); List tableList = new SpreadsheetExtractionAlgorithm().extract(next); for (Table table : tableList) { List> rows = table.getRows(); for (List row : rows) { String collect = row.stream().map(RectangularTextContainer::getText) .map(e -> e.replace(" ", "")) .collect(Collectors.joining(" ")); tableDataText.append("\n").append(collect); } } } } else { while (extract.hasNext()) { Page next = extract.next(); List
tableList = new SpreadsheetExtractionAlgorithm().extract(next); for (Table table : tableList) { List> rows = table.getRows(); for (List row : rows) { String collect = row.stream().map(RectangularTextContainer::getText) .map(e -> e.replace(" ", "")) .map(e -> e.replaceAll(regex, "")) .collect(Collectors.joining(" ")); tableDataText.append("\n").append(collect); } } } } //将文本内容按行分割 return Arrays.stream(tableDataText.toString().trim().split("\\n")).collect(Collectors.toList()); } private List readTextFromPDFUsingPdfBox(PDDocument document) { PDFTextStripper pdfTextStripper = new PDFTextStripper(); pdfTextStripper.setSortByPosition(true); String text = ""; try { text = pdfTextStripper.getText(document); } catch (IOException e) { log.error("error: read text from pdf...", e); } //将文本内容按行分割 return Arrays.stream(text.split("\\n")).collect(Collectors.toList()); } /** * 处理excel的表头行(估值表行,专用表行,日期和单位净值行) * * @param lines PDF每行内容 * @param sheet 表格 * @param waterMark 水印 * @return 返回header所占的行数 */ private Integer handlerExcelHeader(List lines, Sheet sheet, WaterMarkEnum waterMark) { //去掉水印文字(去掉pdfBox读取PDF内容中的水印文字) List noWaterMarkText = removeWaterMark(lines, waterMark); //去掉以空格和换行字符开头的数据行:" \r"(国泰君安证券股份有限公司_嘉禾三号私募证券投资基金_专用表) noWaterMarkText = noWaterMarkText.stream().filter(this::filterHeadLine).collect(Collectors.toList()); //遇到"科目代码"行之前的前三行数据为header行 List headLineList = CollUtil.newArrayList(); for (String line : noWaterMarkText) { if ((StrUtil.isNotBlank(line) && line.trim().startsWith("科目代码"))) { break; } headLineList.add(line); } //head行过滤掉非以"科目"开头的数据行 headLineList = headLineList.stream().filter(e -> e.startsWith("科目") || filterTitleField(e)).collect(Collectors.toList()); //将head行写入表格 for (int i = 0; i < headLineList.size(); i++) { Row row = sheet.createRow(i); List split = splitHeaderRow(headLineList.get(i)); for (int i1 = 0; i1 < split.size(); i1++) { Cell cell = row.createCell(i1); cell.setCellValue(split.get(i1)); } } return sheet.getLastRowNum() + 1; } /** * 分隔和合并head行的内容 * * @param headRowValue head行的内容 * @return 处理后的head行的内容 */ private List splitHeaderRow(String headRowValue) { if (StrUtil.isBlank(headRowValue)) { return CollUtil.newArrayList(); } //国泰君安证券股份有限公司_嘉禾三号私募证券投资基金_专用表(pdfBox识别不到"_") if (headRowValue.contains("国泰君安证券股份有限公司") && !headRowValue.contains("_")) { List collect = Arrays.stream(headRowValue.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList()); return ListUtil.toList(String.join("_", collect)); } return Arrays.stream(headRowValue.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList()); } private boolean filterHeadLine(String text) { if (StrUtil.isBlank(text)) { return false; } if (text.trim().startsWith("成本")) { return false; } String replace = text.replace("\r", "").replace(" ", ""); return !StrUtil.isBlank(replace); } private void readDateToExcel(Sheet sheet, List lines, Integer rowNumber, WaterMarkEnum waterMark) { int baseRowNumber = 0; for (int i = 0; i <= sheet.getLastRowNum(); i++) { if ("科目代码".equals(sheet.getRow(i).getCell(0).getStringCellValue())) { baseRowNumber = i; break; } } for (String field : lines) { //判断field数据是否要写入表格 boolean isNeedToWrite = canNeedToWrite(field); if (!isNeedToWrite) { continue; } Row row = sheet.createRow(rowNumber); List split = Arrays.stream(field.split(" ")).collect(Collectors.toList()); for (int i1 = 0; i1 < split.size(); i1++) { Cell cell = row.createCell(i1); //处理空格的问题 String cellValue = StrUtil.isBlank(split.get(i1)) ? null : split.get(i1).replace("*", ""); //要求为数值的字段,去掉非字符字段 if (StrUtil.isNotBlank(cellValue) && NUMBER_FIELD_LIST.contains(sheet.getRow(baseRowNumber).getCell(i1).getStringCellValue())) { cellValue = filterNotNumber(cellValue); } //特殊处理:科目代码后面带有字母的,如1102.01.01.605056 SH、SQ、CFX等 if (i1 == 0 && i1 + 1 < split.size()) { String nextCellValue = StrUtil.isBlank(split.get(i1 + 1)) ? null : split.get(i1 + 1).replace("*", ""); if (StrUtil.isNotBlank(nextCellValue) && nextCellValue.matches("^[A-Z]+$")) { cellValue = (cellValue + " " + nextCellValue).trim(); split.remove(1); } } cellValue = removeInfluenceOfWaterMark(cellValue, waterMark); cell.setCellValue(cellValue); } rowNumber++; } } /** * 处理水印文字造成的影响(总资产、资产净值两个字段的影响) * * @param cellValue 单元格内容 * @param waterMark 水印 * @return 无水印影响的单元格内容 */ private String removeInfluenceOfWaterMark(String cellValue, WaterMarkEnum waterMark) { if (StrUtil.isBlank(cellValue) || waterMark == null) { return cellValue; } String influenceText = StrUtil.isBlank(waterMark.getInfluenceText()) ? null : waterMark.getInfluenceText(); if (StrUtil.isBlank(influenceText)) { return cellValue; } String rectifyText = waterMark.getRectifyText(); return RestoreCellValue(cellValue, influenceText, rectifyText); } private String RestoreCellValue(String cellValue, String influenceText, String rectifyText) { if (StrUtil.isBlank(cellValue) || StrUtil.isBlank(influenceText) || StrUtil.isBlank(rectifyText)) { return cellValue; } StringBuilder sb = new StringBuilder(); if ("产".equals(influenceText)) { for (int i = 0; i < cellValue.length(); i++) { if (cellValue.charAt(i) == '资') { sb.append(cellValue.charAt(i)); sb.append("产"); } else { sb.append(cellValue.charAt(i)); } } } if ("资".equals(influenceText)) { for (int i = 0; i < cellValue.length(); i++) { if (cellValue.charAt(i) == '产') { sb.append("资"); sb.append(cellValue.charAt(i)); } else { sb.append(cellValue.charAt(i)); } } } return sb.toString(); } private String filterNotNumber(String cellValue) { StringBuilder numberStr = new StringBuilder(); for (int i = 0; i < cellValue.length(); i++) { char c = cellValue.charAt(i); if (Character.isDigit(c) || c == '.' || c=='-') { numberStr.append(c); } } return numberStr.toString(); } private boolean canNeedToWrite(String field) { //过滤以字段列开头的行 boolean isCanContinue = filterTitleField(field); if (!isCanContinue) { return false; } //过滤包含"参考","第","页"的行 if (field.contains("参考") || (field.contains("第") && field.contains("页"))) { return false; } //过滤掉空行 List titleFieldList = Arrays.stream(field.split(" ")).filter(StrUtil::isNotBlank).collect(Collectors.toList()); if (CollUtil.isEmpty(titleFieldList)) { return false; } //过滤只有"%"的数据行 boolean allPercent = titleFieldList.stream().allMatch("%"::equals); return !allPercent; } private boolean filterTitleField(String field) { if (StrUtil.isBlank(field)) { return false; } for (String titleField : TITLE_FIELD_LIST) { if (field.trim().startsWith(titleField)) { return false; } } return true; } /** * 处理excel的字段标题列 * * @param lines PDF内容 * @param sheet excel工作簿 * @param headerRowNumber header行所占的行数 */ private Integer handlerTitleField(List lines, Sheet sheet, Integer headerRowNumber) { Integer rowNumber = headerRowNumber; //1.先筛选出属于标题行的内容 List titleFieldList = getTitleFieldRow(lines); lines.removeAll(titleFieldList); //找到以"科目代码"开头的数据行 int skipNumber = 0; for (String field : titleFieldList) { if (field.trim().startsWith("科目代码")) { break; } skipNumber++; } titleFieldList = titleFieldList.stream().skip(skipNumber).collect(Collectors.toList()); //2.处理字段标题列 //1-字段行占1行:正常的字段列11列 if (titleFieldList.size() == 1) { handleOneRowTitleField(titleFieldList, sheet, rowNumber); return rowNumber + 1; } else if (titleFieldList.size() == 2) { //2-字段行占2行:第2行字段行一般是无用的字段:例如 %等 //处理: 识别不到"原币"、"本币"所在的数据行,导致表格的数据对不上字段列 String secondRowValue = titleFieldList.get(titleFieldList.size() - 1); if (secondRowValue.trim().contains("十亿千百十万千")) { int length = secondRowValue.trim().split(" ").length; if (length >= 6) { String extraRowValue = "原币 本币 原币 本币 原币 本币"; titleFieldList.add(1, extraRowValue); } } if (titleFieldList.size() == 3) { handleThreeRowTitleField(titleFieldList, sheet, rowNumber); rowNumber += 3; } else { handleOneRowTitleField(titleFieldList, sheet, rowNumber); rowNumber++; } return rowNumber; } else if (titleFieldList.size() == 3) { //3-字段行占3行:有原币或者原币、本币字段的17列 handleThreeRowTitleField(titleFieldList, sheet, rowNumber); return rowNumber + 3; } else if (titleFieldList.size() == 4) { //4-字段行占4行:存在两行"科目代码"的特殊字段列,14列(招商证券pdf估值表) handleFourRowTitleField(titleFieldList, sheet, rowNumber); return rowNumber + 4; } return rowNumber; } private void handleFourRowTitleField(List titleFieldList, Sheet sheet, Integer rowNumber) { List fisrtTitleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); List secondTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); List thirdTitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); List fourTitleList = Arrays.stream(titleFieldList.get(3).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); //第一列"科目代码" Row firstRow = sheet.createRow(rowNumber); for (int i = 0; i < fisrtTitleList.size(); i++) { Cell firstCell = firstRow.createCell(i); firstCell.setCellValue(fisrtTitleList.get(i)); } //第二个"科目代码" Row secondRow = sheet.createRow(rowNumber + 1); Row thirdRow = sheet.createRow(rowNumber + 2); Row fourRow = sheet.createRow(rowNumber + 3); int count = 0; for (int index = 0; index < secondTitleList.size(); index++) { Cell secondRowCell = secondRow.createCell(index); Cell thirdCell = thirdRow.createCell(index); Cell fourCell = fourRow.createCell(index); String field = secondTitleList.get(index); if (EXTRA_FIELD_LIST.contains(field)) { secondRowCell.setCellValue(field); thirdCell.setCellValue(thirdTitleList.get(count)); fourCell.setCellValue(fourTitleList.get(count)); count++; } else { secondRowCell.setCellValue(field); thirdCell.setCellValue(field); fourCell.setCellValue(field); } } } private void handleThreeRowTitleField(List titleFieldList, Sheet sheet, Integer rowNumber) { //方案一:先填充每个字段,再合并单元格 List baseTitleList = CollUtil.newArrayList(); List nextTitleList = Arrays.stream(titleFieldList.get(1).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); List next2TitleList = Arrays.stream(titleFieldList.get(2).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); //特殊处理下:如果包含了本币和原币,那么baseTitleList需要重复"成本", "市值", "估值增值"字段 List titleList = Arrays.stream(titleFieldList.get(0).trim().replace("\r", "").split(" ")).collect(Collectors.toList()); if (nextTitleList.contains("原币") && nextTitleList.contains("本币")) { for (String titleField : titleList) { baseTitleList.add(titleField); if (EXTRA_FIELD_LIST.contains(titleField)) { baseTitleList.add(titleField); } } } else if (nextTitleList.contains("本币") || nextTitleList.contains("原币")) { baseTitleList.addAll(titleList); } Row firstRow = sheet.createRow(rowNumber); Row secondRow = sheet.createRow(rowNumber + 1); Row thirdRow = sheet.createRow(rowNumber + 2); int count = 0; for (int index = 0; index < baseTitleList.size(); index++) { Cell firstCell = firstRow.createCell(index); Cell secondRowCell = secondRow.createCell(index); Cell thirdCell = thirdRow.createCell(index); //特殊处理"成本", "市值", "估值增值"字段 String field = baseTitleList.get(index); if (EXTRA_FIELD_LIST.contains(field)) { if (nextTitleList.contains("原币") || nextTitleList.contains("本币")) { firstCell.setCellValue(field); secondRowCell.setCellValue(nextTitleList.get(count)); thirdCell.setCellValue(next2TitleList.get(count)); count++; } } else { firstCell.setCellValue(field); secondRowCell.setCellValue(field); thirdCell.setCellValue(field); } } } private void handleOneRowTitleField(List titleFieldList, Sheet sheet, Integer rowNumber) { Row row = sheet.createRow(rowNumber); List baseTitleList = Arrays.stream(titleFieldList.get(0).split(" ")) .filter(e -> !" ".equals(e) && StrUtil.isNotBlank(e)).collect(Collectors.toList()); for (int i = 0; i < baseTitleList.size(); i++) { Cell cell = row.createCell(i); cell.setCellValue(baseTitleList.get(i)); } } /** * 找到字段标题行 * * @param lines PDF表格的所有行 * @return 字段标题行 */ private List getTitleFieldRow(List lines) { List titleFieldList = CollUtil.newArrayList(); for (String line : lines) { //如果当前行的第一个值是整数(意味着当前的前一行为科目代码行了),那么说明已经找完所有标题行内容 List rowValueList = Arrays.stream(line.split(" ")).collect(Collectors.toList()); String firstValueOfRow = CollUtil.isNotEmpty(rowValueList) ? rowValueList.get(0) : null; if (isInteger(firstValueOfRow)) { break; } if (StrUtil.isNotBlank(line)) { titleFieldList.add(line); } } return titleFieldList; } private void mergeExcelCell(Sheet sheet, boolean isSpecial) { if (sheet == null || sheet.getLastRowNum() < 4) { return; } //先找到"科目代码"字段列在excel中的行号 Integer mixRowNumber = null; Integer maxRowNumber = null; for (int i = 0; i < sheet.getLastRowNum(); i++) { Row row = sheet.getRow(i); //找第一个"科目代码"出现的行数 if ("科目代码".equals(row.getCell(0).toString())) { mixRowNumber = mixRowNumber != null ? mixRowNumber : i; mixRowNumber = isSpecial ? mixRowNumber + 1 : mixRowNumber; isSpecial = false; } //找最后一个"科目代码"出现的行数 if (mixRowNumber != null && "科目代码".equals(row.getCell(0).toString()) && i == sheet.getLastRowNum() - 1) { maxRowNumber = i + 1; break; } } //参数分别为:起始行号、结束行号、起始列号、结束列号 if (mixRowNumber != null && maxRowNumber != null && !mixRowNumber.equals(maxRowNumber)) { Row firstRow = sheet.getRow(mixRowNumber); Row lastRow = sheet.getRow(maxRowNumber); int rowNum = firstRow.getLastCellNum(); for (int i = 0; i < rowNum; i++) { //不同行,同一列的上下单元格合并 if (firstRow.getCell(i).toString().equals(lastRow.getCell(i).toString())) { sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, maxRowNumber, i, i)); } //同一行的左右两个单元格合并 if (i + 1 < rowNum && firstRow.getCell(i).toString().equals(firstRow.getCell(i + 1).toString())) { sheet.addMergedRegion(new CellRangeAddress(mixRowNumber, mixRowNumber, i, i + 1)); } } } } public boolean isInteger(String str) { if (StrUtil.isBlank(str)) { return false; } return str.matches("^-?\\d+$"); } public File saveFile(MultipartFile file, String excelUploadDir, String newExcelName) { File targetFile = null; try { targetFile = new File(excelUploadDir, newExcelName); FileUtils.writeByteArrayToFile(targetFile, file.getBytes()); } catch (IOException e) { e.printStackTrace(); log.error("save file error,error -> {}", e.getMessage()); return targetFile; } return targetFile; } public File saveFile(File file, String excelUploadDir, String newExcelName) { File targetFile = null; try { targetFile = new File(excelUploadDir, newExcelName); FileUtils.writeByteArrayToFile(targetFile, Files.readAllBytes(file.toPath())); } catch (IOException e) { e.printStackTrace(); log.error("save file error,error -> {}", e.getMessage()); return targetFile; } return targetFile; } }