原创

从 PDF 流中找到指定关键字的位置

依赖

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.1</version>
</dependency>


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
 * Locates every occurrence of a keyword inside a PDF that is supplied as a byte array.
 *
 * <p>The class plugs into the text-extraction pass of {@link PDFTextStripper}: each list of
 * {@link TextPosition}s handed to {@link #writeString(String, List)} is scanned for the keyword
 * and one coordinate triple is recorded per hit. A keyword is only recognised when all of its
 * characters arrive in the same {@code writeString} call and each character is reported by its
 * own {@code TextPosition}.
 *
 * <p>Instances keep mutable scan state in fields, so a single instance must not be used from
 * several threads at once.
 *
 * @author: 木杉
 * @email: 819236727@qq.com
 * @Date: 2019/6/15 14:08
 */
public class BoxKeyPosition extends PDFTextStripper {

    /** Characters of the keyword, in order. */
    private char[] key;
    /** Raw bytes of the PDF to scan. */
    private byte[] src;
    /** Hits collected by the most recent {@link #getPosition()} call, across all pages. */
    private List<float[]> list = new ArrayList<float[]>();
    /** Hits collected for the page currently being processed. */
    private List<float[]> pagelist = new ArrayList<float[]>();

    /**
     * Creates a locator for one keyword in one PDF.
     *
     * @param keyWords the keyword to search for
     * @param src the PDF content as bytes
     * @throws IOException if the underlying text stripper cannot be initialised
     */
    public BoxKeyPosition(String keyWords, byte[] src) throws IOException {
        super();
        super.setSortByPosition(true);
        this.src = src;
        this.key = keyWords.toCharArray();
    }

    /** @return the keyword characters (the internal array, not a copy) */
    public char[] getKey() {
        return key;
    }

    /** @param key the keyword characters to search for (stored without copying) */
    public void setKey(char[] key) {
        this.key = key;
    }

    /** @return the PDF bytes (the internal array, not a copy) */
    public byte[] getSrc() {
        return src;
    }

    /** @param src the PDF bytes to scan (stored without copying) */
    public void setSrc(byte[] src) {
        this.src = src;
    }

    /**
     * Scans the PDF page by page and returns the position of every keyword hit.
     *
     * <p>Each element is a three-element array {@code [x, y, pageNum]}: {@code x} and {@code y}
     * are computed in {@link #writeString(String, List)} and {@code pageNum} is the 1-based page
     * on which the hit was found.
     *
     * @return the hits found by this call; empty if the keyword does not occur
     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
     */
    public List<float[]> getPosition() throws IOException {
        // Start from a fresh result list: otherwise a second call would append to the hits of
        // the first one and return every position twice.
        list = new ArrayList<float[]>();
        PDDocument pdf = null;
        try {
            pdf = PDDocument.load(src);
            int pages = pdf.getNumberOfPages();
            super.setSortByPosition(true);
            // The extracted text itself is not needed; only the writeString callbacks matter.
            Writer sink = new OutputStreamWriter(new ByteArrayOutputStream());

            for (int page = 1; page <= pages; page++) {
                pagelist.clear();
                // Restrict the extraction to one page so every hit can be tagged with it.
                super.setStartPage(page);
                super.setEndPage(page);
                super.writeText(pdf, sink);
                for (float[] hit : pagelist) {
                    hit[2] = page;
                }
                list.addAll(pagelist);
            }
            return list;
        } finally {
            if (pdf != null) {
                pdf.close();
            }
        }
    }

    /**
     * Callback invoked during text extraction; records a hit for every position in
     * {@code textPositions} at which the keyword starts.
     *
     * <p>For each hit, {@code x} is the first character's X plus half of
     * {@code key.length * width} of that first character, and {@code y} is the first character's
     * Y minus its height. The page slot (index 2) is filled in later by {@link #getPosition()}.
     * A {@code null} or empty keyword produces no hits.
     *
     * @param string the text of the run (unused)
     * @param textPositions the positioned characters of the run
     * @throws IOException never thrown by this implementation; declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (key == null || key.length == 0) {
            return;
        }
        // Last index at which the whole keyword still fits into this run.
        int lastStart = textPositions.size() - key.length;
        for (int start = 0; start <= lastStart; start++) {
            if (!matchesAt(textPositions, start)) {
                continue;
            }
            TextPosition first = textPositions.get(start);
            float[] idx = new float[3];
            idx[0] = first.getX() + key.length * first.getWidth() / 2;
            idx[1] = first.getY() - first.getHeight();
            pagelist.add(idx);
        }
    }

    /** Tells whether the keyword's characters appear consecutively beginning at {@code start}. */
    private boolean matchesAt(List<TextPosition> textPositions, int start) {
        for (int offset = 0; offset < key.length; offset++) {
            String glyph = textPositions.get(start + offset).getUnicode();
            // Constant-first equals keeps the comparison safe if no unicode text is available.
            if (!String.valueOf(key[offset]).equals(glyph)) {
                return false;
            }
        }
        return true;
    }
}


正文到此结束