|
@@ -0,0 +1,190 @@
|
|
|
+package com.simuwang.daq.components;
|
|
|
+
|
|
|
+import org.apache.fontbox.util.BoundingBox;
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
+import org.apache.pdfbox.pdmodel.font.PDFont;
|
|
|
+import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
|
|
|
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
|
|
+import org.apache.pdfbox.text.TextPosition;
|
|
|
+import technology.tabula.RectangleSpatialIndex;
|
|
|
+import technology.tabula.TextElement;
|
|
|
+import technology.tabula.TextStripper;
|
|
|
+import technology.tabula.Utils;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * @author wangzaijun
|
|
|
+ * @date 2024/9/12 14:00
|
|
|
+ * @description 自定义的文本去水印方法,发现水印基本是旋转文字并且比报告内其他文字都大;主要依据文本旋转角度和字体大小判断是否为水印
|
|
|
+ */
|
|
|
+public class CustomTabulaTextStripper extends TextStripper {
|
|
|
+ private static final String NBSP = "\u00A0";
|
|
|
+ private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f;
|
|
|
+ private static final float MAX_BLANK_FONT_SIZE = 40.0f;
|
|
|
+ private static final float MIN_BLANK_FONT_SIZE = 2.0f;
|
|
|
+ private final PDDocument document;
|
|
|
+ private final ArrayList<TextElement> textElements;
|
|
|
+ private final RectangleSpatialIndex<TextElement> spatialIndex;
|
|
|
+ private float minCharWidth = Float.MAX_VALUE;
|
|
|
+ private float minCharHeight = Float.MAX_VALUE;
|
|
|
+ private float totalHeight = 0.0f;
|
|
|
+ private int countHeight = 0;
|
|
|
+
|
|
|
+ public CustomTabulaTextStripper(PDDocument document, int pageNumber) throws IOException {
|
|
|
+ super(document, pageNumber);
|
|
|
+ this.document = document;
|
|
|
+ this.setStartPage(pageNumber);
|
|
|
+ this.setEndPage(pageNumber);
|
|
|
+ this.textElements = new ArrayList<>();
|
|
|
+ this.spatialIndex = new RectangleSpatialIndex<>();
|
|
|
+ }
|
|
|
+
|
|
|
+ public void process() throws IOException {
|
|
|
+ this.getText(this.document);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ protected void writeString(String string, List<TextPosition> textPositions) {
|
|
|
+ // 有旋转角度的文字
|
|
|
+ List<TextPosition> rotationTexts = textPositions.stream()
|
|
|
+ .filter(e -> e.getTextMatrix().getValue(0, 1) != 0.).collect(Collectors.toList());
|
|
|
+ // 水印文字基本都是有角度的,统计有旋转角度的文字高度
|
|
|
+ List<Float> heights = rotationTexts.stream().map(TextPosition::getHeight).collect(Collectors.toList());
|
|
|
+ // 如果全是水印文字则直接去除
|
|
|
+ if (textPositions.size() == heights.size()) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 其他场景需要写TextElement属性
|
|
|
+ for (TextPosition textPosition : textPositions) {
|
|
|
+ if (textPosition == null) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ String c = textPosition.getUnicode();
|
|
|
+
|
|
|
+ // if c not printable, return
|
|
|
+ if (!isPrintable(c)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ float h = textPosition.getHeightDir();
|
|
|
+
|
|
|
+ if (c.equals(NBSP)) { // replace non-breaking space for space
|
|
|
+ c = " ";
|
|
|
+ }
|
|
|
+
|
|
|
+ // 文字没有旋转角度,并且水印字体大小没有包含当前文字时说明是正常文字
|
|
|
+ float rotation = textPosition.getTextMatrix().getValue(0, 1);
|
|
|
+ if (rotation != 0. || heights.contains(h)) {
|
|
|
+ c = " ";
|
|
|
+ }
|
|
|
+
|
|
|
+ float wos = textPosition.getWidthOfSpace();
|
|
|
+
|
|
|
+ TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
|
|
|
+ Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
|
|
|
+ Utils.round(textPosition.getHeightDir(), 2), textPosition.getFont(), textPosition.getFontSizeInPt(), c,
|
|
|
+ // workaround a possible bug in PDFBox:
|
|
|
+ // https://issues.apache.org/jira/browse/PDFBOX-1755
|
|
|
+ wos, textPosition.getDir());
|
|
|
+
|
|
|
+ this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
|
|
|
+ this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
|
|
|
+
|
|
|
+ countHeight++;
|
|
|
+ totalHeight += te.getHeight();
|
|
|
+ float avgHeight = totalHeight / countHeight;
|
|
|
+
|
|
|
+ //We have an issue where tall blank cells throw off the row height calculation
|
|
|
+ //Introspect a blank cell a bit here to see if it should be thrown away
|
|
|
+ if ((te.getText() == null || te.getText().trim().equals(""))) {
|
|
|
+ //if the cell height is more than AVG_HEIGHT_MULT_THRESHOLDxaverage, throw it away
|
|
|
+ if (avgHeight > 0
|
|
|
+ && te.getHeight() >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ //if the font size is outside of reasonable ranges, throw it away
|
|
|
+ if (textPosition.getFontSizeInPt() > MAX_BLANK_FONT_SIZE || textPosition.getFontSizeInPt() < MIN_BLANK_FONT_SIZE) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ this.spatialIndex.add(te);
|
|
|
+ this.textElements.add(te);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ protected float computeFontHeight(PDFont font) throws IOException {
|
|
|
+ BoundingBox bbox = font.getBoundingBox();
|
|
|
+ if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
|
|
|
+ // PDFBOX-2158 and PDFBOX-3130
|
|
|
+ // files by Salmat eSolutions / ClibPDF Library
|
|
|
+ bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
|
|
|
+ }
|
|
|
+ // 1/2 the bbox is used as the height todo: why?
|
|
|
+ float glyphHeight = bbox.getHeight() / 2;
|
|
|
+
|
|
|
+ // sometimes the bbox has very high values, but CapHeight is OK
|
|
|
+ PDFontDescriptor fontDescriptor = font.getFontDescriptor();
|
|
|
+ if (fontDescriptor != null) {
|
|
|
+ float capHeight = fontDescriptor.getCapHeight();
|
|
|
+ if (Float.compare(capHeight, 0) != 0 &&
|
|
|
+ (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
|
|
|
+ glyphHeight = capHeight;
|
|
|
+ }
|
|
|
+ // PDFBOX-3464, PDFBOX-448:
|
|
|
+ // sometimes even CapHeight has very high value, but Ascent and Descent are ok
|
|
|
+ float ascent = fontDescriptor.getAscent();
|
|
|
+ float descent = fontDescriptor.getDescent();
|
|
|
+ if (ascent > 0 && descent < 0 &&
|
|
|
+ ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0)) {
|
|
|
+ glyphHeight = (ascent - descent) / 2;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // transformPoint from glyph space -> text space
|
|
|
+ float height;
|
|
|
+ if (font instanceof PDType3Font) {
|
|
|
+ height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
|
|
|
+ } else {
|
|
|
+ height = glyphHeight / 1000;
|
|
|
+ }
|
|
|
+
|
|
|
+ return height;
|
|
|
+ }
|
|
|
+
|
|
|
+ private boolean isPrintable(String s) {
|
|
|
+ char c;
|
|
|
+ Character.UnicodeBlock block;
|
|
|
+ boolean printable = false;
|
|
|
+ for (int i = 0; i < s.length(); i++) {
|
|
|
+ c = s.charAt(i);
|
|
|
+ block = Character.UnicodeBlock.of(c);
|
|
|
+ printable |= !Character.isISOControl(c) && block != null && block != Character.UnicodeBlock.SPECIALS;
|
|
|
+ }
|
|
|
+ return printable;
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<TextElement> getTextElements() {
|
|
|
+ return this.textElements;
|
|
|
+ }
|
|
|
+
|
|
|
+ public RectangleSpatialIndex<TextElement> getSpatialIndex() {
|
|
|
+ return spatialIndex;
|
|
|
+ }
|
|
|
+
|
|
|
+ public float getMinCharWidth() {
|
|
|
+ return minCharWidth;
|
|
|
+ }
|
|
|
+
|
|
|
+ public float getMinCharHeight() {
|
|
|
+ return minCharHeight;
|
|
|
+ }
|
|
|
+}
|