COSDocument.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.cos;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.ScratchFile;
/**
* This is the in-memory representation of the PDF document. You need to call
* close() on this object when you are done using it!!
*
* @author Ben Litchfield
*
*/
public class COSDocument extends COSBase implements Closeable
{
/**
* Log instance.
*/
private static final Log LOG = LogFactory.getLog(COSDocument.class);
/**
* Header version of the PDF document. Starts at 1.4 and can be changed with setVersion.
*/
private float version = 1.4f;
/**
* Maps ObjectKeys to a COSObject. Note that references to these objects
* are also stored in COSDictionary objects that map a name to a specific object.
* Entries are created on demand by getObjectFromPool.
*/
private final Map<COSObjectKey, COSObject> objectPool =
new HashMap<>();
/**
* Maps object and generation id to object byte offsets.
*/
private final Map<COSObjectKey, Long> xrefTable =
new HashMap<>();
/**
* List containing all streams which are created when creating a new pdf.
* They are collected by createCOSStream() and closed in close().
*/
private final List<COSStream> streams = new ArrayList<>();
/**
* Document trailer dictionary. Stays null until setTrailer is called.
*/
private COSDictionary trailer;
/**
* Signal that document is already decrypted.
*/
private boolean isDecrypted = false;
/**
* The startxref value of the document, see setStartXref/getStartXref.
*/
private long startXref;
/**
* Set to true at the end of close(); further close() calls return immediately.
*/
private boolean closed = false;
/**
* Flag telling whether the trailer is an XRef stream, see isXRefStream/setIsXRefStream.
*/
private boolean isXRefStream;
/**
* Scratch file handed to every COSStream created by this document; closed in close().
*/
private ScratchFile scratchFile;
/**
* Used for incremental saving, to avoid XRef object numbers from being reused.
*/
private long highestXRefObjectNumber;
/**
* Parser passed to newly pooled COSObjects and used to create the random access views
* for parsed streams. It is null when a constructor without parser argument was used.
*/
private final ICOSParser parser;
/**
* State object of this document. It is returned by getDocumentState and registered as
* origin document state on the trailer in setTrailer.
*/
private final COSDocumentState documentState = new COSDocumentState();
/**
 * Creates a document that buffers its PDF streams in main memory only and has no
 * parser attached.
 */
public COSDocument()
{
    this(MemoryUsageSetting.setupMainMemoryOnly(), null);
}
/**
 * Creates a document that buffers its PDF streams in main memory only and resolves
 * its objects through the given parser.
 *
 * @param parser parser used to parse the document on demand
 */
public COSDocument(ICOSParser parser)
{
    this(MemoryUsageSetting.setupMainMemoryOnly(), parser);
}
/**
 * Creates a document without a parser whose PDF streams are stored according to the
 * given memory settings.
 *
 * @param memUsageSetting defines how memory is used for buffering PDF streams
 */
public COSDocument(MemoryUsageSetting memUsageSetting)
{
    this(memUsageSetting, (ICOSParser) null);
}
/**
* Constructor that will use the provided memory settings for storage of the PDF streams.
*
* @param memUsageSetting defines how memory is used for buffering PDF streams
* @param parser Parser to be used to parse the document on demand
*
*/
public COSDocument(MemoryUsageSetting memUsageSetting, ICOSParser parser)
{
try
{
if (memUsageSetting != null)
{
scratchFile = new ScratchFile(memUsageSetting);
}
else
{
scratchFile = ScratchFile.getMainMemoryOnlyInstance();
}
}
catch (IOException ioe)
{
LOG.warn("Error initializing scratch file: " + ioe.getMessage()
+ ". Fall back to main memory usage only.", ioe);
scratchFile = ScratchFile.getMainMemoryOnlyInstance();
}
this.parser = parser;
}
/**
 * Creates a new, empty COSStream backed by the scratch file of this document.
 * <p>
 * The stream is remembered by the document so that it gets closed together with it.
 *
 * @return the new COSStream
 */
public COSStream createCOSStream()
{
    COSStream created = new COSStream(scratchFile);
    // Streams of a newly created pdf aren't part of the object pool, so they are tracked
    // separately to be able to close them in close().
    streams.add(created);
    return created;
}
/**
 * Creates a new COSStream whose data is a view on the parsed source. Not for public use,
 * only COSParser should call this method.
 * <p>
 * The view is requested from the parser of this document, and all entries of the given
 * dictionary are copied to the new stream.
 *
 * @param dictionary the corresponding dictionary
 * @param startPosition the start position within the source
 * @param streamLength the stream length
 * @return the new COSStream
 * @throws IOException if the random access view can't be read
 */
public COSStream createCOSStream(COSDictionary dictionary, long startPosition,
        long streamLength) throws IOException
{
    COSStream created = new COSStream(scratchFile,
            parser.createRandomAccessReadView(startPosition, streamLength));
    dictionary.forEach(created::setItem);
    return created;
}
/**
 * Looks up the linearization dictionary of the pdf.
 * <p>
 * All objects with a positive offset in the xref table are visited in ascending offset
 * order, and the first dictionary containing a Linearized entry is returned.
 *
 * @return the dictionary containing the linearization information, or null if there is none
 */
public COSDictionary getLinearizedDictionary()
{
    // the linearization dictionary shall be the first object within the pdf, therefore
    // the candidates are ordered by their offset
    List<COSObjectKey> keysByOffset = xrefTable.entrySet().stream()
            .filter(entry -> entry.getValue() > 0L)
            .sorted(Entry.comparingByValue())
            .map(Entry::getKey)
            .collect(Collectors.toList());
    for (COSObjectKey candidateKey : keysByOffset)
    {
        COSBase resolved = getObjectFromPool(candidateKey).getObject();
        if (!(resolved instanceof COSDictionary))
        {
            continue;
        }
        COSDictionary candidate = (COSDictionary) resolved;
        if (candidate.getItem(COSName.LINEARIZED) != null)
        {
            return candidate;
        }
    }
    return null;
}
/**
 * Collects all dictionary objects of the given type.
 * <p>
 * Shortcut for {@link #getObjectsByType(COSName, COSName)} without a second type.
 *
 * @param type The type of the object.
 *
 * @return All objects with the specified type.
 */
public List<COSObject> getObjectsByType(COSName type)
{
    final COSName noAlternativeType = null;
    return getObjectsByType(type, noAlternativeType);
}
/**
 * This will get all dictionaries objects by type.
 * <p>
 * Every key of the xref table is resolved through the object pool, and the object is
 * returned if it is a dictionary whose Type entry equals one of the given types.
 *
 * @param type1 The first possible type of the object, mandatory.
 * @param type2 The second possible type of the object, usually an abbreviation, optional.
 *
 * @return This will return all objects with the specified type(s).
 */
public List<COSObject> getObjectsByType(COSName type1, COSName type2)
{
    // Iterate over a snapshot of the keys. Dereferencing an object goes through the
    // parser, and the xref table can be extended from outside (addXRefTable/getXrefTable).
    // Iterating the live key set would fail with a ConcurrentModificationException as
    // soon as an entry is added during the loop.
    List<COSObjectKey> originKeys = new ArrayList<>(xrefTable.keySet());
    List<COSObject> retval = filterObjectsByType(originKeys, type1, type2);
    // keys which were added to the xref table while resolving the objects above
    // are inspected in a second pass
    if (originKeys.size() < xrefTable.size())
    {
        List<COSObjectKey> additionalKeys = new ArrayList<>(xrefTable.keySet());
        additionalKeys.removeAll(originKeys);
        retval.addAll(filterObjectsByType(additionalKeys, type1, type2));
    }
    return retval;
}

/**
 * Resolves the given keys and keeps the objects that are dictionaries of a matching type.
 *
 * @param keys the object keys to be inspected
 * @param type1 the first possible type, mandatory
 * @param type2 the second possible type, may be null
 * @return the pooled objects whose dictionary has one of the given types
 */
private List<COSObject> filterObjectsByType(List<COSObjectKey> keys, COSName type1,
        COSName type2)
{
    List<COSObject> matches = new ArrayList<>();
    for (COSObjectKey objectKey : keys)
    {
        COSObject objectFromPool = getObjectFromPool(objectKey);
        COSBase realObject = objectFromPool.getObject();
        if (realObject instanceof COSDictionary)
        {
            COSName dictType = ((COSDictionary) realObject).getCOSName(COSName.TYPE);
            if (type1.equals(dictType) || (type2 != null && type2.equals(dictType)))
            {
                matches.add(objectFromPool);
            }
        }
    }
    return matches;
}
/**
 * Sets the header version of this PDF document.
 *
 * @param versionValue The new version of the PDF document.
 */
public void setVersion(float versionValue)
{
    this.version = versionValue;
}
/**
 * Returns the header version of this PDF document.
 *
 * @return The header version; 1.4 unless a different value has been set.
 */
public float getVersion()
{
    return this.version;
}
/**
 * Marks the document as completely decrypted.
 */
public void setDecrypted()
{
    this.isDecrypted = true;
}
/**
 * Indicates if an encrypted pdf is already decrypted after parsing.
 *
 * @return true if {@link #setDecrypted()} has been called for this document.
 */
public boolean isDecrypted()
{
    return this.isDecrypted;
}
/**
 * Tells whether this is an encrypted document, i.e. whether the trailer holds an
 * encryption dictionary.
 *
 * @return true if this document is encrypted, false otherwise or if there is no trailer.
 */
public boolean isEncrypted()
{
    if (trailer == null)
    {
        return false;
    }
    COSDictionary encryption = trailer.getCOSDictionary(COSName.ENCRYPT);
    return encryption != null;
}
/**
 * This will get the encryption dictionary if the document is encrypted or null if the document
 * is not encrypted.
 * <p>
 * A document without a trailer is treated as not encrypted, consistent with
 * {@link #isEncrypted()}, so null is returned instead of failing with a
 * NullPointerException.
 *
 * @return The encryption dictionary, or null if there is none.
 */
public COSDictionary getEncryptionDictionary()
{
    return trailer != null ? trailer.getCOSDictionary(COSName.ENCRYPT) : null;
}
/**
 * Stores the encryption dictionary in the trailer. This should only be called when
 * encrypting the document.
 *
 * @param encDictionary The encryption dictionary.
 */
public void setEncryptionDictionary(COSDictionary encDictionary)
{
    this.trailer.setItem(COSName.ENCRYPT, encDictionary);
}
/**
 * Reads the document ID from the trailer.
 *
 * @return The document id.
 */
public COSArray getDocumentID()
{
    COSDictionary trailerDictionary = getTrailer();
    return trailerDictionary.getCOSArray(COSName.ID);
}
/**
 * Stores the document ID in the trailer.
 *
 * @param id The document id.
 */
public void setDocumentID(COSArray id)
{
    COSDictionary trailerDictionary = getTrailer();
    trailerDictionary.setItem(COSName.ID, id);
}
/**
 * Returns the document trailer.
 *
 * @return the document trailer dictionary, or null if none has been set
 */
public COSDictionary getTrailer()
{
    return this.trailer;
}
/**
 * Sets the document trailer and registers the state of this document as the origin
 * document state of the trailer's update state.
 * <p>
 * Note: maybe this should not be supported, as the trailer is a persistence construct.
 *
 * @param newTrailer the document trailer dictionary
 */
public void setTrailer(COSDictionary newTrailer)
{
    this.trailer = newTrailer;
    newTrailer.getUpdateState().setOriginDocumentState(documentState);
}
/**
 * Internal PDFBox use only. Returns the object number of the highest XRef stream, which
 * is needed to avoid reusing such a number in incremental saving.
 *
 * @return The object number of the highest XRef stream, or 0 if there was no XRef stream.
 */
public long getHighestXRefObjectNumber()
{
    return this.highestXRefObjectNumber;
}
/**
 * Internal PDFBox use only. Stores the object number of the highest XRef stream, which
 * is needed to avoid reusing such a number in incremental saving.
 *
 * @param highestXRefObjectNumber The object number of the highest XRef stream.
 */
public void setHighestXRefObjectNumber(long highestXRefObjectNumber)
{
    this.highestXRefObjectNumber = highestXRefObjectNumber;
}
/**
 * Double dispatch entry point of the visitor pattern: hands this document over to
 * {@code visitFromDocument} of the given visitor.
 *
 * @param visitor The object to notify when visiting this object.
 * @throws IOException If an error occurs while visiting this object.
 */
@Override
public void accept(ICOSVisitor visitor) throws IOException
{
    visitor.visitFromDocument(this);
}
/**
 * Closes all storage held by this document: the streams of the pooled objects, the
 * streams created through {@link #createCOSStream()} and finally the scratch file.
 * <p>
 * Every resource is closed even if closing an earlier one failed. Failures are logged
 * and the first one is rethrown at the end. Calling this method on an already closed
 * document does nothing.
 *
 * @throws IOException If there is an error closing the resources.
 */
@Override
public void close() throws IOException
{
    if (closed)
    {
        return;
    }

    // keeps the first failure while the remaining resources are still being closed
    IOException pending = null;

    // streams of objects which have been loaded into the pool
    for (COSObject pooled : objectPool.values())
    {
        if (pooled.isObjectNull())
        {
            continue;
        }
        COSBase resolved = pooled.getObject();
        if (resolved instanceof COSStream)
        {
            pending = IOUtils.closeAndLogException((COSStream) resolved, LOG,
                    "COSStream", pending);
        }
    }

    // streams created for a new pdf
    for (COSStream created : streams)
    {
        pending = IOUtils.closeAndLogException(created, LOG, "COSStream", pending);
    }

    if (scratchFile != null)
    {
        pending = IOUtils.closeAndLogException(scratchFile, LOG, "ScratchFile", pending);
    }

    closed = true;

    // rethrow the first exception to keep the method contract
    if (pending != null)
    {
        throw pending;
    }
}
/**
 * Tells whether this document has been closed.
 *
 * @return true if {@link #close()} has completed for this document
 */
public boolean isClosed()
{
    return this.closed;
}
/**
 * Returns the pooled object for the given key. If the pool has no entry for the key yet,
 * a new "proxy" object bound to the parser of this document is created and pooled,
 * which covers forward references.
 *
 * @param key The object key.
 *
 * @return The object in the pool or a new one if it has not been parsed yet; null if
 * the given key is null.
 */
public COSObject getObjectFromPool(COSObjectKey key)
{
    if (key == null)
    {
        return null;
    }
    return objectPool.computeIfAbsent(key, missingKey -> new COSObject(missingKey, parser));
}
/**
 * Adds all given entries to the xref table of this document. Each entry maps an
 * ObjectKey to a byte offset in the file.
 *
 * @param xrefTableValues xref table entries to be added
 */
public void addXRefTable(Map<COSObjectKey, Long> xrefTableValues)
{
    this.xrefTable.putAll(xrefTableValues);
}
/**
 * Returns the xref table, a mapping of ObjectKeys to byte offsets in the file.
 * The internal map itself is returned, not a copy.
 *
 * @return mapping of ObjectKeys to byte offsets
 */
public Map<COSObjectKey, Long> getXrefTable()
{
    return this.xrefTable;
}
/**
 * Sets the startxref value of the document. This will only be needed for
 * incremental updates.
 *
 * @param startXrefValue the value for startXref
 */
public void setStartXref(long startXrefValue)
{
    this.startXref = startXrefValue;
}
/**
 * Returns the startxref position of the parsed document. This will only be needed for
 * incremental updates.
 *
 * @return the old position of the startxref
 */
public long getStartXref()
{
    return this.startXref;
}
/**
 * Tells whether the trailer is an XRef stream.
 *
 * @return true if the trailer is an XRef stream
 */
public boolean isXRefStream()
{
    return this.isXRefStream;
}
/**
 * Sets the flag telling whether the trailer is an XRef stream. You need to take care
 * that the version of your PDF is 1.5 or higher.
 *
 * @param isXRefStreamValue the new value for isXRefStream
 */
public void setIsXRefStream(boolean isXRefStreamValue)
{
    this.isXRefStream = isXRefStreamValue;
}
/**
 * Gives access to the {@link COSDocumentState} of this {@link COSDocument}.
 *
 * @return The {@link COSDocumentState} of this {@link COSDocument}.
 * @see COSDocumentState
 */
public COSDocumentState getDocumentState()
{
    return this.documentState;
}
}