BruteForceParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
import org.apache.pdfbox.pdmodel.encryption.ProtectionPolicy;
import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
/**
* Brute force parser to be used as last resort if a malformed pdf can't be read.
*/
public class BruteForceParser extends COSParser
{
private static final char[] XREF_TABLE = new char[] { 'x', 'r', 'e', 'f' };
private static final char[] XREF_STREAM = new char[] { '/', 'X', 'R', 'e', 'f' };
private static final long MINIMUM_SEARCH_OFFSET = 6;
/**
* EOF-marker.
*/
private static final char[] EOF_MARKER = new char[] { '%', '%', 'E', 'O', 'F' };
/**
* obj-marker.
*/
private static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
/**
* trailer-marker.
*/
private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' };
/**
* ObjStream-marker.
*/
private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' };
private static final Log LOG = LogFactory.getLog(BruteForceParser.class);
/**
* Contains all found objects of a brute force search.
*/
private final Map<COSObjectKey, Long> bfSearchCOSObjectKeyOffsets = new HashMap<>();
/**
* Constructor. Triggers a brute force search for all objects of the document.
*
* @param source input representing the pdf.
* @param document the corresponding COS document
*
*/
public BruteForceParser(RandomAccessRead source, COSDocument document) throws IOException
{
super(source);
this.document = document;
bfSearchForObjects();
}
/**
* Returns all found objects of a brute force search.
*
* @return map containing all found objects of a brute force search
*/
protected Map<COSObjectKey, Long> getBFCOSObjectOffsets()
{
return bfSearchCOSObjectKeyOffsets;
}
/**
* Brute force search for every object in the pdf.
*
* @throws IOException if something went wrong
*/
private void bfSearchForObjects() throws IOException
{
long lastEOFMarker = bfSearchForLastEOFMarker();
long originOffset = source.getPosition();
long currentOffset = MINIMUM_SEARCH_OFFSET;
long lastObjectId = Long.MIN_VALUE;
int lastGenID = Integer.MIN_VALUE;
long lastObjOffset = Long.MIN_VALUE;
char[] endobjString = "ndo".toCharArray();
char[] endobjRemainingString = "bj".toCharArray();
boolean endOfObjFound = false;
do
{
source.seek(currentOffset);
int nextChar = source.read();
currentOffset++;
if (isWhitespace(nextChar) && isString(OBJ_MARKER))
{
long tempOffset = currentOffset - 2;
source.seek(tempOffset);
int genID = source.peek();
// is the next char a digit?
if (isDigit(genID))
{
genID -= 48;
tempOffset--;
source.seek(tempOffset);
if (isWhitespace())
{
while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace())
{
source.seek(--tempOffset);
}
boolean objectIDFound = false;
while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
{
source.seek(--tempOffset);
objectIDFound = true;
}
if (objectIDFound)
{
source.read();
long objectId = readObjectNumber();
if (lastObjOffset > 0)
{
// add the former object ID only if there was a subsequent object ID
bfSearchCOSObjectKeyOffsets.put(
new COSObjectKey(lastObjectId, lastGenID), lastObjOffset);
}
lastObjectId = objectId;
lastGenID = genID;
lastObjOffset = tempOffset + 1;
currentOffset += OBJ_MARKER.length - 1;
endOfObjFound = false;
}
}
}
}
// check for "endo" as abbreviation for "endobj", as the pdf may be cut off
// in the middle of the keyword, see PDFBOX-3936.
// We could possibly implement a more intelligent algorithm if necessary
else if (nextChar == 'e' && isString(endobjString))
{
currentOffset += endobjString.length;
source.seek(currentOffset);
if (source.isEOF())
{
endOfObjFound = true;
}
else if (isString(endobjRemainingString))
{
currentOffset += endobjRemainingString.length;
endOfObjFound = true;
}
}
} while (currentOffset < lastEOFMarker && !source.isEOF());
if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0)
{
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID),
lastObjOffset);
}
// reestablish origin position
source.seek(originOffset);
}
/**
* Search for the offset of the given xref table/stream among those found by a brute force search.
*
* @param xrefOffset the given offset to be searched for
*
* @return the offset of the xref entry
* @throws IOException if something went wrong
*/
protected long bfSearchForXRef(long xrefOffset) throws IOException
{
long newOffset = -1;
// initialize bfSearchXRefTablesOffsets -> not null
List<Long> bfSearchXRefTablesOffsets = bfSearchForXRefTables();
// initialize bfSearchXRefStreamsOffsets -> not null
List<Long> bfSearchXRefStreamsOffsets = bfSearchForXRefStreams();
// TODO to be optimized, this won't work in every case
long newOffsetTable = searchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
// TODO to be optimized, this won't work in every case
long newOffsetStream = searchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
// choose the nearest value
if (newOffsetTable > -1 && newOffsetStream > -1)
{
long differenceTable = xrefOffset - newOffsetTable;
long differenceStream = xrefOffset - newOffsetStream;
if (Math.abs(differenceTable) > Math.abs(differenceStream))
{
newOffset = newOffsetStream;
bfSearchXRefStreamsOffsets.remove(newOffsetStream);
}
else
{
newOffset = newOffsetTable;
bfSearchXRefTablesOffsets.remove(newOffsetTable);
}
}
else if (newOffsetTable > -1)
{
newOffset = newOffsetTable;
bfSearchXRefTablesOffsets.remove(newOffsetTable);
}
else if (newOffsetStream > -1)
{
newOffset = newOffsetStream;
bfSearchXRefStreamsOffsets.remove(newOffsetStream);
}
return newOffset;
}
private long searchNearestValue(List<Long> values, long offset)
{
long newValue = -1;
Long currentDifference = null;
int currentOffsetIndex = -1;
int numberOfOffsets = values.size();
// find the nearest value
for (int i = 0; i < numberOfOffsets; i++)
{
long newDifference = offset - values.get(i);
// find the nearest offset
if (currentDifference == null
|| (Math.abs(currentDifference) > Math.abs(newDifference)))
{
currentDifference = newDifference;
currentOffsetIndex = i;
}
}
if (currentOffsetIndex > -1)
{
newValue = values.get(currentOffsetIndex);
}
return newValue;
}
/**
* Brute force search for all objects streams of a pdf.
*
* @param trailerResolver the trailer resolver of the document
* @param securityHandler security handler to be used to decrypt encrypted documents
* @throws IOException if something went wrong
*/
protected void bfSearchForObjStreams(XrefTrailerResolver trailerResolver,
SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
{
// update security handler
this.securityHandler = securityHandler;
// save origin offset
long originOffset = source.getPosition();
Map<Long, COSObjectKey> bfSearchForObjStreamOffsets = bfSearchForObjStreamOffsets();
// log warning about skipped stream
bfSearchForObjStreamOffsets.entrySet().stream() //
.filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) == null) //
.forEach(o -> LOG.warn(
"Skipped incomplete object stream:" + o.getValue() + " at " + o.getKey()));
// collect all stream offsets
List<Long> objStreamOffsets = bfSearchForObjStreamOffsets.entrySet().stream() //
.filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) != null) //
.filter(o -> o.getKey().equals(bfSearchCOSObjectKeyOffsets.get(o.getValue()))) //
.map(Map.Entry::getKey) //
.collect(Collectors.toList());
// add all found compressed objects to the brute force search result
for (Long offset : objStreamOffsets)
{
source.seek(offset);
long stmObjNumber = readObjectNumber();
int stmGenNumber = readGenerationNumber();
readExpectedString(OBJ_MARKER, true);
COSStream stream = null;
try
{
COSDictionary dict = parseCOSDictionary(false);
stream = parseCOSStream(dict);
if (securityHandler != null)
{
securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber);
}
PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document);
Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers();
Map<COSObjectKey, Long> xrefOffset = trailerResolver.getXrefTable();
for (Long objNumber : objectNumbers.keySet())
{
COSObjectKey objKey = new COSObjectKey(objNumber, 0);
Long existingOffset = bfSearchCOSObjectKeyOffsets.get(objKey);
if (existingOffset != null && existingOffset < 0)
{
// translate stream object key to its offset
COSObjectKey objStmKey = new COSObjectKey(Math.abs(existingOffset), 0);
existingOffset = bfSearchCOSObjectKeyOffsets.get(objStmKey);
}
if (existingOffset == null || offset > existingOffset)
{
bfSearchCOSObjectKeyOffsets.put(objKey, -stmObjNumber);
xrefOffset.put(objKey, -stmObjNumber);
}
}
}
catch (IOException exception)
{
LOG.debug("Skipped corrupt stream: (" + stmObjNumber + " 0 at offset " + offset,
exception);
}
finally
{
if (stream != null)
{
stream.close();
}
}
}
// restore origin offset
source.seek(originOffset);
}
/**
* Brute force search for all trailer marker.
*
* @param trailer dictionary to be used as trailer dictionary
*
* @throws IOException if something went wrong
*/
private boolean bfSearchForTrailer(COSDictionary trailer) throws IOException
{
long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
// search for trailer marker
long trailerOffset = findString(TRAILER_MARKER);
while (trailerOffset != -1)
{
try
{
boolean rootFound = false;
boolean infoFound = false;
skipSpaces();
COSDictionary trailerDict = parseCOSDictionary(true);
COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
if (rootObj != null)
{
// check if the dictionary can be dereferenced and is the one we are looking for
COSBase rootDict = rootObj.getObject();
if (rootDict instanceof COSDictionary && isCatalog((COSDictionary) rootDict))
{
rootFound = true;
}
}
COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
if (infoObj != null)
{
// check if the dictionary can be dereferenced and is the one we are looking for
COSBase infoDict = infoObj.getObject();
if (infoDict instanceof COSDictionary && isInfo((COSDictionary) infoDict))
{
infoFound = true;
}
}
if (rootFound && infoFound)
{
trailer.setItem(COSName.ROOT, rootObj);
trailer.setItem(COSName.INFO, infoObj);
if (trailerDict.containsKey(COSName.ENCRYPT))
{
COSObject encObj = trailerDict.getCOSObject(COSName.ENCRYPT);
// check if the dictionary can be dereferenced
// TODO check if the dictionary is an encryption dictionary?
if (encObj != null && encObj.getObject() instanceof COSDictionary)
{
trailer.setItem(COSName.ENCRYPT, encObj);
}
}
if (trailerDict.containsKey(COSName.ID))
{
COSBase idObj = trailerDict.getItem(COSName.ID);
if (idObj instanceof COSArray)
{
trailer.setItem(COSName.ID, idObj);
}
}
return true;
}
}
catch (IOException exception)
{
LOG.debug("An exception occurred during brute force search for trailer - ignoring",
exception);
}
trailerOffset = findString(TRAILER_MARKER);
}
source.seek(originOffset);
return false;
}
/**
* Search for the different parts of the trailer dictionary.
*
* @param trailer dictionary to be used as trailer dictionary
* @return true if the root was found, false if not.
*/
private boolean searchForTrailerItems(COSDictionary trailer)
{
COSObject rootObject = null;
COSObject infoObject = null;
for (Entry<COSObjectKey, Long> entrySet : bfSearchCOSObjectKeyOffsets.entrySet())
{
COSObjectKey currentKey = entrySet.getKey();
COSObject cosObject = document.getObjectFromPool(currentKey);
COSBase baseObject = cosObject.getObject();
if (!(baseObject instanceof COSDictionary))
{
continue;
}
COSDictionary dictionary = (COSDictionary) baseObject;
// document catalog
if (isCatalog(dictionary))
{
rootObject = compareCOSObjects(cosObject, entrySet.getValue(), rootObject);
}
// info dictionary
else if (isInfo(dictionary))
{
infoObject = compareCOSObjects(cosObject, entrySet.getValue(), infoObject);
}
// encryption dictionary, if existing, is lost
// We can't run "Algorithm 2" from PDF specification because of missing ID
}
if (rootObject != null)
{
trailer.setItem(COSName.ROOT, rootObject);
}
if (infoObject != null)
{
trailer.setItem(COSName.INFO, infoObject);
}
return rootObject != null;
}
private COSObject compareCOSObjects(COSObject newObject, Long newOffset,
COSObject currentObject)
{
if (currentObject != null && currentObject.getKey() != null)
{
COSObjectKey currentKey = currentObject.getKey();
COSObjectKey newKey = newObject.getKey();
// check if the current object is an updated version of the previous found object
if (currentKey.getNumber() == newKey.getNumber())
{
return currentKey.getGeneration() < newKey.getGeneration() ? newObject
: currentObject;
}
// most likely the object with the bigger offset is the newer one
Long currentOffset = document.getXrefTable().get(currentKey);
return currentOffset != null && newOffset > currentOffset ? newObject : currentObject;
}
return newObject;
}
/**
* Brute force search for the last EOF marker.
*
* @throws IOException if something went wrong
*/
private long bfSearchForLastEOFMarker() throws IOException
{
long lastEOFMarker = -1;
long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
long tempMarker = findString(EOF_MARKER);
while (tempMarker != -1)
{
try
{
// check if the following data is some valid pdf content
// which most likely indicates that the pdf is linearized,
// updated or just cut off somewhere in the middle
skipSpaces();
if (!isString(XREF_TABLE))
{
readObjectNumber();
readGenerationNumber();
}
}
catch (IOException exception)
{
// save the EOF marker as the following data is most likely some garbage
LOG.debug("An exception occurred during brute force for last EOF - ignoring",
exception);
lastEOFMarker = tempMarker;
}
tempMarker = findString(EOF_MARKER);
}
source.seek(originOffset);
// no EOF marker found
if (lastEOFMarker == -1)
{
lastEOFMarker = Long.MAX_VALUE;
}
return lastEOFMarker;
}
/**
* Search for all offsets of object streams within the given pdf
*
* @return a map of all offsets for object streams
* @throws IOException if something went wrong
*/
private Map<Long, COSObjectKey> bfSearchForObjStreamOffsets() throws IOException
{
HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
source.seek(MINIMUM_SEARCH_OFFSET);
char[] string = " obj".toCharArray();
// search for object stream marker
long positionObjStream = findString(OBJ_STREAM);
while (positionObjStream != -1)
{
// search backwards for the beginning of the object
long newOffset = -1;
boolean objFound = false;
for (int i = 1; i < 40 && !objFound; i++)
{
long currentOffset = positionObjStream - (i * 10);
if (currentOffset > 0)
{
source.seek(currentOffset);
for (int j = 0; j < 10; j++)
{
if (isString(string))
{
long tempOffset = currentOffset - 1;
source.seek(tempOffset);
int genID = source.peek();
// is the next char a digit?
if (isDigit(genID))
{
tempOffset--;
source.seek(tempOffset);
if (isSpace())
{
int length = 0;
source.seek(--tempOffset);
while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
{
source.seek(--tempOffset);
length++;
}
if (length > 0)
{
source.read();
newOffset = source.getPosition();
long objNumber = readObjectNumber();
int genNumber = readGenerationNumber();
COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
genNumber);
bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
}
}
}
LOG.debug("Dictionary start for object stream -> " + newOffset);
objFound = true;
break;
}
else
{
currentOffset++;
source.read();
}
}
}
}
source.seek(positionObjStream + OBJ_STREAM.length);
positionObjStream = findString(OBJ_STREAM);
}
return bfSearchObjStreamsOffsets;
}
/**
* Brute force search for all xref entries (tables).
*
* @throws IOException if something went wrong
*/
private List<Long> bfSearchForXRefTables() throws IOException
{
List<Long> bfSearchXRefTablesOffsets = new ArrayList<>();
// a pdf may contain more than one xref entry
source.seek(MINIMUM_SEARCH_OFFSET);
// search for xref tables
long newOffset = findString(XREF_TABLE);
while (newOffset != -1)
{
source.seek(newOffset - 1);
// ensure that we don't read "startxref" instead of "xref"
if (isWhitespace())
{
bfSearchXRefTablesOffsets.add(newOffset);
}
source.seek(newOffset + 4);
newOffset = findString(XREF_TABLE);
}
return bfSearchXRefTablesOffsets;
}
/**
* Brute force search for all /XRef entries (streams).
*
* @throws IOException if something went wrong
*/
private List<Long> bfSearchForXRefStreams() throws IOException
{
List<Long> bfSearchXRefStreamsOffsets = new ArrayList<>();
// a pdf may contain more than one /XRef entry
source.seek(MINIMUM_SEARCH_OFFSET);
// search for XRef streams
String objString = " obj";
char[] string = objString.toCharArray();
long xrefOffset = findString(XREF_STREAM);
while (xrefOffset != -1)
{
// search backwards for the beginning of the stream
long newOffset = -1;
boolean objFound = false;
for (int i = 1; i < 40 && !objFound; i++)
{
long currentOffset = xrefOffset - (i * 10);
if (currentOffset > 0)
{
source.seek(currentOffset);
for (int j = 0; j < 10; j++)
{
if (isString(string))
{
long tempOffset = currentOffset - 1;
source.seek(tempOffset);
int genID = source.peek();
// is the next char a digit?
if (isDigit(genID))
{
tempOffset--;
source.seek(tempOffset);
if (isSpace())
{
int length = 0;
source.seek(--tempOffset);
while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
{
source.seek(--tempOffset);
length++;
}
if (length > 0)
{
source.read();
newOffset = source.getPosition();
}
}
}
LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
+ newOffset);
objFound = true;
break;
}
else
{
currentOffset++;
source.read();
}
}
}
}
if (newOffset > -1)
{
bfSearchXRefStreamsOffsets.add(newOffset);
}
source.seek(xrefOffset + 5);
xrefOffset = findString(XREF_STREAM);
}
return bfSearchXRefStreamsOffsets;
}
/**
* Tell if the dictionary is an info dictionary.
*
* @param dictionary the dictionary to be checked
* @return true if the given dictionary is an info dictionary
*/
private boolean isInfo(COSDictionary dictionary)
{
if (dictionary.containsKey(COSName.PARENT) || dictionary.containsKey(COSName.A)
|| dictionary.containsKey(COSName.DEST))
{
return false;
}
if (!dictionary.containsKey(COSName.MOD_DATE) && !dictionary.containsKey(COSName.TITLE)
&& !dictionary.containsKey(COSName.AUTHOR)
&& !dictionary.containsKey(COSName.SUBJECT)
&& !dictionary.containsKey(COSName.KEYWORDS)
&& !dictionary.containsKey(COSName.CREATOR)
&& !dictionary.containsKey(COSName.PRODUCER)
&& !dictionary.containsKey(COSName.CREATION_DATE))
{
return false;
}
return true;
}
/**
* Tell if the dictionary is a PDF or FDF catalog.
*
* @param dictionary
* @return true if the given dictionary is a root dictionary
*/
private boolean isCatalog(COSDictionary dictionary)
{
return COSName.CATALOG.equals(dictionary.getCOSName(COSName.TYPE))
|| dictionary.containsKey(COSName.FDF);
}
/**
* Search for the given string. The search starts at the current position and returns the start position if the
* string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
* current position is either the end of the string or the end of the input.
*
* @param string the string to be searched
* @return the start position of the found string
* @throws IOException if something went wrong
*/
private long findString(char[] string) throws IOException
{
long position = -1L;
int stringLength = string.length;
int counter = 0;
int readChar = source.read();
while (readChar != -1)
{
if (readChar == string[counter])
{
if (counter == 0)
{
position = source.getPosition() - 1;
}
counter++;
if (counter == stringLength)
{
return position;
}
}
else if (counter > 0)
{
counter = 0;
position = -1L;
continue;
}
readChar = source.read();
}
return position;
}
/**
* Rebuild the trailer dictionary if startxref can't be found.
*
* @param trailerResolver the trailer resolver of the document
* @param securityHandler security handler to be used to decrypt encrypted documents
* @return the rebuild trailer dictionary
*
* @throws IOException if something went wrong
*/
protected COSDictionary rebuildTrailer(XrefTrailerResolver trailerResolver,
SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException
{
// update security handler
this.securityHandler = securityHandler;
// reset trailer resolver
trailerResolver.reset();
// use the found objects to rebuild the trailer resolver
trailerResolver.nextXrefObj(0, XRefType.TABLE);
bfSearchCOSObjectKeyOffsets.forEach(trailerResolver::setXRef);
trailerResolver.setStartxref(0);
COSDictionary trailer = trailerResolver.getTrailer();
document.setTrailer(trailer);
boolean searchForObjStreamsDone = false;
if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
{
// root entry wasn't found, maybe it is part of an object stream
// brute force search for all object streams.
bfSearchForObjStreams(trailerResolver, securityHandler);
searchForObjStreamsDone = true;
// search again for the root entry
searchForTrailerItems(trailer);
}
// prepare decryption if necessary
prepareDecryption();
if (!searchForObjStreamsDone)
{
// brute force search for all object streams.
bfSearchForObjStreams(trailerResolver, securityHandler);
}
return trailer;
}
}