我正在尝试从 PDF 中提取文本坐标和行(或矩形)坐标。
The TextPosition
班级有getXDirAdj()
and getYDirAdj()
根据相应 TextPosition 对象表示的文本片段的方向转换坐标的方法(根据 @mkl 的评论进行更正)
无论页面旋转如何,最终输出都是一致的。
输出所需的坐标是 X0,Y0(页面的左上角)
这是对 @Tilman Hausherr 的解决方案的轻微修改。 y 坐标被反转(高度 - y)以使其与文本提取过程中的坐标保持一致,并且输出也写入 csv。
public class LineCatcher extends PDFGraphicsStreamEngine
{
private static final GeneralPath linePath = new GeneralPath();
private static ArrayList<Rectangle2D> rectList= new ArrayList<Rectangle2D>();
private int clipWindingRule = -1;
private static String headerRecord = "Text|Page|x|y|width|height|space|font";
public LineCatcher(PDPage page)
{
super(page);
}
public static void main(String[] args) throws IOException
{
if( args.length != 4 )
{
usage();
}
else
{
PDDocument document = null;
FileOutputStream fop = null;
File file;
Writer osw = null;
int numPages;
double page_height;
try
{
document = PDDocument.load( new File(args[0], args[1]) );
numPages = document.getNumberOfPages();
file = new File(args[2], args[3]);
fop = new FileOutputStream(file);
// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
}
osw = new OutputStreamWriter(fop, "UTF8");
osw.write(headerRecord + System.lineSeparator());
System.out.println("Line Processing numPages:" + numPages);
for (int n = 0; n < numPages; n++) {
System.out.println("Line Processing page:" + n);
rectList = new ArrayList<Rectangle2D>();
PDPage page = document.getPage(n);
page_height = page.getCropBox().getUpperRightY();
LineCatcher lineCatcher = new LineCatcher(page);
lineCatcher.processPage(page);
try{
for(Rectangle2D rect:rectList) {
String pageNum = Integer.toString(n + 1);
String x = Double.toString(rect.getX());
String y = Double.toString(page_height - rect.getY()) ;
String w = Double.toString(rect.getWidth());
String h = Double.toString(rect.getHeight());
writeToFile(pageNum, x, y, w, h, osw);
}
rectList = null;
page = null;
lineCatcher = null;
}
catch(IOException io){
throw new IOException("Failed to Parse document for line processing. Incorrect document format. Page:" + n);
}
};
}
catch(IOException io){
throw new IOException("Failed to Parse document for line processing. Incorrect document format.");
}
finally
{
if ( osw != null ){
osw.close();
}
if( document != null )
{
document.close();
}
}
}
}
private static void writeToFile(String pageNum, String x, String y, String w, String h, Writer osw) throws IOException {
String c = "^" + "|" +
pageNum + "|" +
x + "|" +
y + "|" +
w + "|" +
h + "|" +
"999" + "|" +
"marker-only";
osw.write(c + System.lineSeparator());
}
@Override
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException
{
// to ensure that the path is created in the right direction, we have to create
// it by combining single lines instead of creating a simple rectangle
linePath.moveTo((float) p0.getX(), (float) p0.getY());
linePath.lineTo((float) p1.getX(), (float) p1.getY());
linePath.lineTo((float) p2.getX(), (float) p2.getY());
linePath.lineTo((float) p3.getX(), (float) p3.getY());
// close the subpath instead of adding the last line so that a possible set line
// cap style isn't taken into account at the "beginning" of the rectangle
linePath.closePath();
}
@Override
public void drawImage(PDImage pdi) throws IOException
{
}
@Override
public void clip(int windingRule) throws IOException
{
// the clipping path will not be updated until the succeeding painting operator is called
clipWindingRule = windingRule;
}
@Override
public void moveTo(float x, float y) throws IOException
{
linePath.moveTo(x, y);
}
@Override
public void lineTo(float x, float y) throws IOException
{
linePath.lineTo(x, y);
}
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException
{
linePath.curveTo(x1, y1, x2, y2, x3, y3);
}
@Override
public Point2D getCurrentPoint() throws IOException
{
return linePath.getCurrentPoint();
}
@Override
public void closePath() throws IOException
{
linePath.closePath();
}
@Override
public void endPath() throws IOException
{
if (clipWindingRule != -1)
{
linePath.setWindingRule(clipWindingRule);
getGraphicsState().intersectClippingPath(linePath);
clipWindingRule = -1;
}
linePath.reset();
}
@Override
public void strokePath() throws IOException
{
rectList.add(linePath.getBounds2D());
linePath.reset();
}
@Override
public void fillPath(int windingRule) throws IOException
{
linePath.reset();
}
@Override
public void fillAndStrokePath(int windingRule) throws IOException
{
linePath.reset();
}
@Override
public void shadingFill(COSName cosn) throws IOException
{
}
/**
* This will print the usage for this document.
*/
private static void usage()
{
System.err.println( "Usage: java " + LineCatcher.class.getName() + " <input-pdf>" + " <output-file>");
}
}
正在使用PDFGraphicsStreamEngine
类来提取线和矩形坐标。线条和矩形的坐标与文本的坐标不对齐
绿色:文本
红色:按原样获取的线坐标
黑色:预期坐标(对输出应用变换后获得)
尝试过setRotation()
在运行线提取之前校正旋转的方法。但结果并不一致。
使用 PDFBox 获得旋转并获得一致的线/矩形坐标输出有哪些可能的选项?