PDFXrefStreamParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.pdfparser;

import java.io.IOException;

import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.cos.COSObjectKey;

/**
 * This will parse a PDF 1.5 (or better) Xref stream and
 * extract the xref information from the stream.
 *
 *  @author Justin LeFebvre
 */
public class PDFXrefStreamParser extends BaseParser
{
    private final int[] w = new int[3];
    private ObjectNumbers objectNumbers = null;

    /**
     * Constructor.
     *
     * @param stream The stream to parse.
     * @param document The document for the current parsing.
     *
     * @throws IOException If there is an error initializing the stream.
     */
    public PDFXrefStreamParser(COSStream stream, COSDocument document)
            throws IOException
    {
        super(stream.createView());
        this.document = document;
        try
        {
            initParserValues(stream);
        }
        catch (IOException exception)
        {
            close();
            throw exception;
        }
    }

    private void initParserValues(COSStream stream) throws IOException
    {
        COSArray wArray = stream.getCOSArray(COSName.W);
        if (wArray == null)
        {
            throw new IOException("/W array is missing in Xref stream");
        }
        if (wArray.size() != 3)
        {
            throw new IOException(
                    "Wrong number of values for /W array in XRef: " + Arrays.toString(w));
        }
        for (int i = 0; i < 3; i++)
        {
            w[i] = wArray.getInt(i, 0);
        }
        if (w[0] < 0 || w[1] < 0 || w[2] < 0)
        {
            throw new IOException("Incorrect /W array in XRef: " + Arrays.toString(w));
        }

        COSArray indexArray = stream.getCOSArray(COSName.INDEX);
        if (indexArray == null)
        {
            // If /Index doesn't exist, we will use the default values.
            indexArray = new COSArray();
            indexArray.add(COSInteger.ZERO);
            indexArray.add(COSInteger.get(stream.getInt(COSName.SIZE, 0)));
        }
        if (indexArray.size() == 0 || indexArray.size() % 2 == 1)
        {
            throw new IOException(
                    "Wrong number of values for /Index array in XRef: " + Arrays.toString(w));
        }
        // create an Iterator for all object numbers using the index array
        objectNumbers = new ObjectNumbers(indexArray);
    }

    private void close() throws IOException
    {
        if (source != null)
        {
            source.close();
        }
        document = null;
        objectNumbers = null;
    }

    /**
     * Parses through the unfiltered stream and populates the xrefTable HashMap.
     * 
     * @param resolver resolver to read the xref/trailer information
     * @throws IOException If there is an error while parsing the stream.
     */
    public void parse(XrefTrailerResolver resolver) throws IOException
    {
        byte[] currLine = new byte[w[0] + w[1] + w[2]];
        while (!isEOF() && objectNumbers.hasNext())
        {
            readNextValue(currLine);
            // get the current objID
            long objID = objectNumbers.next();
            // default value is 1 if w[0] == 0, otherwise parse first field
            int type = w[0] == 0 ? 1 : (int) parseValue(currLine, 0, w[0]);
            // Skip free objects (type 0) and invalid types
            if (type == 0)
            {
                continue;
            }
            // second field holds the offset (type 1) or the object stream number (type 2)
            long offset = parseValue(currLine, w[0], w[1]);
            // third field holds the generation number for type 1 entries
            int genNum = type == 1 ? (int) parseValue(currLine, w[0] + w[1], w[2]) : 0;
            COSObjectKey objKey = new COSObjectKey(objID, genNum);
            if (type == 1)
            {
                resolver.setXRef(objKey, offset);
            }
            else
            {
                // For XRef aware parsers we have to know which objects contain object streams. We will store this
                // information in normal xref mapping table but add object stream number with minus sign in order to
                // distinguish from file offsets
                resolver.setXRef(objKey, -offset);
            }
        }
        close();
    }

    private void readNextValue(byte[] value) throws IOException
    {
        int remainingBytes = value.length;
        int amountRead;
        while ((amountRead = source.read(value, value.length - remainingBytes, remainingBytes)) > 0)
        {
            remainingBytes -= amountRead;
        }
    }

    private long parseValue(byte[] data, int start, int length)
    {
        long value = 0;
        for (int i = 0; i < length; i++)
        {
            value += ((long) data[i + start] & 0x00ff) << ((length - i - 1) * 8);
        }
        return value;
    }

    private static class ObjectNumbers implements Iterator<Long>
    {
        private final long[] start;
        private final long[] end;
        private int currentRange = 0;
        private long currentEnd = 0;
        private long currentNumber = 0;
        private long maxValue = 0;

        private ObjectNumbers(COSArray indexArray) throws IOException
        {
            start = new long[indexArray.size() / 2];
            end = new long[start.length];
            int counter = 0;
            Iterator<COSBase> indexIter = indexArray.iterator();
            while (indexIter.hasNext())
            {
                COSBase base = indexIter.next();
                if (!(base instanceof COSInteger))
                {
                    throw new IOException("Xref stream must have integer in /Index array");
                }
                long startValue = ((COSInteger) base).longValue();
                if (!indexIter.hasNext())
                {
                    break;
                }
                base = indexIter.next();
                if (!(base instanceof COSInteger))
                {
                    throw new IOException("Xref stream must have integer in /Index array");
                }
                long sizeValue = ((COSInteger) base).longValue();
                start[counter] = startValue;
                end[counter++] = startValue + sizeValue;
            }
            currentNumber = start[0];
            currentEnd = end[0];
            maxValue = end[counter - 1];
        }

        @Override
        public boolean hasNext()
        {
            return currentNumber < maxValue;
        }

        @Override
        public Long next()
        {
            if (currentNumber >= maxValue)
            {
                throw new NoSuchElementException();
            }
            if (currentNumber < currentEnd)
            {
                return currentNumber++;
            }
            currentNumber = start[++currentRange];
            currentEnd = end[currentRange];
            return currentNumber++;
        }
    }

}