如何让 iText 像 pdftotext -layout 那样读取 PDF?


我正在寻找一种最简单的方法来实现一个 Java 解决方案,使其输出与下面这条命令的输出类似:

pdftotext -layout FILE

(在 Linux 机器上运行。当然,这个方案的成本也应该低廉。)

我刚刚尝试了 IText、PDFBox 和 PDFTextStream 的一些代码片段。迄今为止最准确的解决方案是 PDFTextStream,它使用 VisualOutputTarget 来获得我的文件的良好表示。

这样我的列布局就能被正确识别,我可以在此基础上继续处理。不过 iText 应该也有相应的解决方案吧?


// 我按照 mkl 的说明编写了自己的策略对象,如下所示:

package com.test.pdfextractiontest.itext;

import ...

public class MyLocationTextExtractionStrategy implements TextExtractionStrategy {

    /** set to true for debugging */
    static boolean DUMP_STATE = false;

    /** a summary of all found text */
    private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();

    public MyLocationTextExtractionStrategy() {

    public void beginTextBlock() {

    public void endTextBlock() {

    private boolean startsWithSpace(final String str) {
        if (str.length() == 0) {
            return false;
        return str.charAt(0) == ' ';

    private boolean endsWithSpace(final String str) {
        if (str.length() == 0) {
            return false;
        return str.charAt(str.length() - 1) == ' ';

    private List<TextChunk> filterTextChunks(final List<TextChunk> textChunks, final TextChunkFilter filter) {
        if (filter == null) {
            return textChunks;

        final List<TextChunk> filtered = new ArrayList<TextChunk>();
        for (final TextChunk textChunk : textChunks) {
            if (filter.accept(textChunk)) {
        return filtered;

    protected boolean isChunkAtWordBoundary(final TextChunk chunk, final TextChunk previousChunk) {
        final float dist = chunk.distanceFromEndOf(previousChunk);

        if (dist < -chunk.getCharSpaceWidth() || dist > chunk.getCharSpaceWidth() / 2.0f) {
            return true;

        return false;

    public String getResultantText(final TextChunkFilter chunkFilter) {
        if (DUMP_STATE) {

        final List<TextChunk> filteredTextChunks = filterTextChunks(this.locationalResult, chunkFilter);

        final StringBuffer sb = new StringBuffer();
        TextChunk lastChunk = null;
        for (final TextChunk chunk : filteredTextChunks) {

            if (lastChunk == null) {
            } else {
                if (chunk.sameLine(lastChunk)) {

                    if (isChunkAtWordBoundary(chunk, lastChunk) && !startsWithSpace(chunk.text)
                            && !endsWithSpace(lastChunk.text)) {
                        sb.append(' ');
                    final Float dist = chunk.distanceFromEndOf(lastChunk)/3;
                    for(int i = 0; i<Math.round(dist); i++) {
                        sb.append(' ');
                } else {
            lastChunk = chunk;

        return sb.toString();

返回包含结果文本的字符串。 */ @Override public String get Resultant Text() {

        return getResultantText(null);


    private void dumpState() {
        for (final TextChunk location : this.locationalResult) {



    public void renderText(final TextRenderInfo renderInfo) {
        LineSegment segment = renderInfo.getBaseline();
        if (renderInfo.getRise() != 0) { 

            final Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise());
            segment = segment.transformBy(riseOffsetTransform);
        final TextChunk location =
                new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(),

    public static class TextChunk implements Comparable<TextChunk> {
        /** the text of the chunk */
        private final String text;
        /** the starting location of the chunk */
        private final Vector startLocation;
        /** the ending location of the chunk */
        private final Vector endLocation;
        /** unit vector in the orientation of the chunk */
        private final Vector orientationVector;
        /** the orientation as a scalar for quick sorting */
        private final int orientationMagnitude;

        private final TextRenderInfo info;

        private final int distPerpendicular;

        private final float distParallelStart;

        private final float distParallelEnd;
        /** the width of a single space character in the font of the chunk */
        private final float charSpaceWidth;

        public TextChunk(final String string, final Vector startLocation, final Vector endLocation,
                final float charSpaceWidth,final TextRenderInfo ri) {
            this.text = string;
            this.startLocation = startLocation;
            this.endLocation = endLocation;
            this.charSpaceWidth = charSpaceWidth;

   = ri;

            Vector oVector = endLocation.subtract(startLocation);
            if (oVector.length() == 0) {
                oVector = new Vector(1, 0, 0);
            this.orientationVector = oVector.normalize();
            this.orientationMagnitude =
                    (int) (Math.atan2(this.orientationVector.get(Vector.I2), this.orientationVector.get(Vector.I1)) * 1000);

            final Vector origin = new Vector(0, 0, 1);
            this.distPerpendicular = (int) startLocation.subtract(origin).cross(this.orientationVector).get(Vector.I3);

            this.distParallelStart =;
            this.distParallelEnd =;

        public Vector getStartLocation() {
            return this.startLocation;

        public Vector getEndLocation() {
            return this.endLocation;

        public String getText() {
            return this.text;

        public float getCharSpaceWidth() {
            return this.charSpaceWidth;

        private void printDiagnostics() {
            System.out.println("Text (@" + this.startLocation + " -> " + this.endLocation + "): " + this.text);
            System.out.println("orientationMagnitude: " + this.orientationMagnitude);
            System.out.println("distPerpendicular: " + this.distPerpendicular);
            System.out.println("distParallel: " + this.distParallelStart);

        public boolean sameLine(final TextChunk as) {
            if (this.orientationMagnitude != as.orientationMagnitude) {
                return false;
            if (this.distPerpendicular != as.distPerpendicular) {
                return false;
            return true;

        public float distanceFromEndOf(final TextChunk other) {
            final float distance = this.distParallelStart - other.distParallelEnd;
            return distance;

        public float myDistanceFromEndOf(final TextChunk other) {
            final float distance = this.distParallelStart - other.distParallelEnd;
            return distance;

        public int compareTo(final TextChunk rhs) {
            if (this == rhs) {
                return 0; // not really needed, but just in case

            int rslt;
            rslt = compareInts(this.orientationMagnitude, rhs.orientationMagnitude);
            if (rslt != 0) {
                return rslt;

            rslt = compareInts(this.distPerpendicular, rhs.distPerpendicular);
            if (rslt != 0) {
                return rslt;

            return, rhs.distParallelStart);

        private static int compareInts(final int int1, final int int2) {
            return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;

        public TextRenderInfo getInfo() {


    public void renderImage(final ImageRenderInfo renderInfo) {
        // do nothing

    public static interface TextChunkFilter {

        public boolean accept(TextChunk textChunk);



                // Pad the gap with one space per 3 units of distance to the previous chunk.
                final Float dist = chunk.distanceFromEndOf(lastChunk)/3;
                for(int i = 0; i<Math.round(dist); i++) {
                    sb.append(' ');
                }

我把上面这段代码加入了 getResultantText 方法,用空格来填充文本块之间的间隙。但问题是:


this: this:

有谁知道如何为这个距离计算出更合适的值吗?我认为问题的原因是原始字体是 ArialMT,而我的编辑器使用的是 Courier。但要处理这张表格,我需要能够在正确的位置拆分表格以取出数据;由于各个值的起止位置是浮动的等原因,这很困难。



            // Pad the gap with one space per 3 units of distance to the previous chunk.
            final Float dist = chunk.distanceFromEndOf(lastChunk)/3;
            for(int i = 0; i<Math.round(dist); i++) {
                sb.append(' ');
            }

这种插入空格的做法的问题在于:它假设 StringBuffer 中的当前位置恰好对应于 lastChunk 的末尾(前提是每个字符的宽度为 3 个用户空间单位)。实际情况未必如此,通常每追加一次字符都会破坏先前的这种对应关系。例如,使用比例字体时,下面这两行的宽度并不相同:




因此,你必须查看 chunk 相对于页面左边界的起始位置,并据此向缓冲区中添加相应数量的空格。



public String getResultantText(TextChunkFilter chunkFilter){
    if (DUMP_STATE) dumpState();
    List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);

    int startOfLinePosition = 0;
    StringBuffer sb = new StringBuffer();
    TextChunk lastChunk = null;
    for (TextChunk chunk : filteredTextChunks) {

        if (lastChunk == null){
            insertSpaces(sb, startOfLinePosition, chunk.distParallelStart, false);
        } else {
            if (chunk.sameLine(lastChunk))
                if (isChunkAtWordBoundary(chunk, lastChunk))
                    insertSpaces(sb, startOfLinePosition, chunk.distParallelStart, !startsWithSpace(chunk.text) && !endsWithSpace(lastChunk.text));
            } else {
                startOfLinePosition = sb.length();
                insertSpaces(sb, startOfLinePosition, chunk.distParallelStart, false);
        lastChunk = chunk;

    return sb.toString();       

void insertSpaces(StringBuffer sb, int startOfLinePosition, float chunkStart, boolean spaceRequired)
    int indexNow = sb.length() - startOfLinePosition;
    int indexToBe = (int)((chunkStart - pageLeft) / fixedCharWidth);
    int spacesToInsert = indexToBe - indexNow;
    if (spacesToInsert < 1 && spaceRequired)
        spacesToInsert = 1;
    for (; spacesToInsert > 0; spacesToInsert--)
        sb.append(' ');

/** Coordinate of the left page border; it is subtracted from a chunk's start position before the column index is computed. */
public float pageLeft = 0;
/** Assumed width of a single output character; horizontal distances are divided by it to obtain column indices. */
public float fixedCharWidth = 6;

pageLeft 是页面左边界的坐标。策略对象本身并不知道这个值,因此必须显式告知;不过在很多情况下,0 就是正确的值。


fixedCharWidth 是假定的字符宽度。根据所处理的 PDF 中文字的排版情况,可能需要选用不同的值。在你的例子中,值 3 似乎比我用的 6 更合适。


  • 它假设不存在跨越多个表格列的文本块。这个假设通常成立,但我见过一些奇怪的 PDF:其中正常的单词间距是通过在一定偏移处另起一个文本块来实现的,而列间距却由单个文本块中的一个空格字符表示(该文本块横跨一列的结尾和下一列的开头)!这个空格字符的宽度是通过 PDF 图形状态中的字间距(word spacing)设置来控制的。

  • 它忽略了行与行之间大小不等的垂直间距。


