ToUnicodeWriter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pdfbox.pdmodel.font;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.pdfbox.util.Hex;

/**
 * Writes ToUnicode Mapping Files.
 *
 * <p>Mappings from CIDs to Unicode text are collected with {@link #add(int, String)} and then
 * serialized as an ASCII CMap by {@link #writeTo(OutputStream)}. Consecutive mappings are merged
 * into <code>bfrange</code> entries where this is permitted, to produce a smaller CMap.
 *
 * @author John Hewson
 */
final class ToUnicodeWriter
{
    // sorted by CID so that consecutive CIDs can be merged into ranges while iterating
    private final Map<Integer, String> cidToUnicode = new TreeMap<>();
    private int wMode;

    /**
     * Maximum number of entries written per <code>beginbfrange</code> / <code>endbfrange</code>
     * operator pair. Package-visible to test the corner case of PDFBOX-4302.
     */
    static final int MAX_ENTRIES_PER_OPERATOR = 100;

    /**
     * Creates a new ToUnicode CMap writer with horizontal writing mode (WMode 0).
     */
    ToUnicodeWriter()
    {
        this.wMode = 0;
    }

    /**
     * Sets the WMode (writing mode) of this CMap.
     *
     * @param wMode 1 for vertical, 0 for horizontal (default)
     */
    public void setWMode(int wMode)
    {
        this.wMode = wMode;
    }

    /**
     * Adds the given CID to Unicode mapping. A mapping previously added for the same CID is
     * replaced.
     *
     * @param cid CID, must be in the range 0..0xFFFF
     * @param text Unicode text, up to 512 bytes (the length is not checked here).
     * @throws IllegalArgumentException if the CID is out of range or the text is null or empty
     */
    public void add(int cid, String text)
    {
        if (cid < 0 || cid > 0xFFFF)
        {
            throw new IllegalArgumentException("CID is not valid");
        }

        if (text == null || text.isEmpty())
        {
            throw new IllegalArgumentException("Text is null or empty");
        }

        cidToUnicode.put(cid, text);
    }

    /**
     * Writes the CMap as ASCII to the given output stream. The stream is flushed but not closed.
     *
     * @param out ASCII output stream
     * @throws IOException if the stream could not be written
     */
    public void writeTo(OutputStream out) throws IOException
    {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.US_ASCII));

        writeLine(writer, "/CIDInit /ProcSet findresource begin");
        writeLine(writer, "12 dict begin\n");

        writeLine(writer, "begincmap");
        writeLine(writer, "/CIDSystemInfo");
        writeLine(writer, "<< /Registry (Adobe)");
        writeLine(writer, "/Ordering (UCS)");
        writeLine(writer, "/Supplement 0");
        writeLine(writer, ">> def\n");

        writeLine(writer, "/CMapName /Adobe-Identity-UCS" + " def");
        writeLine(writer, "/CMapType 2 def\n"); // 2 = ToUnicode

        if (wMode != 0)
        {
            // WMode is an integer entry; a leading slash would turn the value into a name,
            // which CMap parsers expecting an integer ignore
            writeLine(writer, "/WMode " + wMode + " def");
        }

        // ToUnicode always uses 16-bit CIDs
        writeLine(writer, "1 begincodespacerange");
        writeLine(writer, "<0000> <FFFF>");
        writeLine(writer, "endcodespacerange\n");

        // CID -> Unicode mappings, we use ranges to generate a smaller CMap.
        // The three lists are parallel: entry i describes the range srcFrom[i]..srcTo[i]
        // whose first CID maps to dstString[i].
        List<Integer> srcFrom = new ArrayList<>();
        List<Integer> srcTo = new ArrayList<>();
        List<String> dstString = new ArrayList<>();

        Map.Entry<Integer, String> prev = null;

        for (Map.Entry<Integer, String> next : cidToUnicode.entrySet())
        {
            if (allowCIDToUnicodeRange(prev, next))
            {
                // extend range
                srcTo.set(srcTo.size() - 1, next.getKey());
            }
            else
            {
                // begin range
                srcFrom.add(next.getKey());
                srcTo.add(next.getKey());
                dstString.add(next.getValue());
            }
            prev = next;
        }

        // limit entries per operator: emit the ranges in batches of at most
        // MAX_ENTRIES_PER_OPERATOR, the last batch holding the remainder
        int rangeCount = srcFrom.size();
        for (int start = 0; start < rangeCount; start += MAX_ENTRIES_PER_OPERATOR)
        {
            int count = Math.min(MAX_ENTRIES_PER_OPERATOR, rangeCount - start);
            writer.write(count + " beginbfrange\n");
            for (int j = 0; j < count; j++)
            {
                int index = start + j;
                // CIDs were validated to fit in 16 bits by add(), so narrowing to short
                // keeps all significant bits
                writer.write('<');
                writer.write(Hex.getChars(srcFrom.get(index).shortValue()));
                writer.write("> ");

                writer.write('<');
                writer.write(Hex.getChars(srcTo.get(index).shortValue()));
                writer.write("> ");

                writer.write('<');
                writer.write(Hex.getCharsUTF16BE(dstString.get(index)));
                writer.write(">\n");
            }
            writeLine(writer, "endbfrange\n");
        }

        // footer
        writeLine(writer, "endcmap");
        writeLine(writer, "CMapName currentdict /CMap defineresource pop");
        writeLine(writer, "end");
        writeLine(writer, "end");

        writer.flush();
    }

    // writeLine writes the text followed by a single line feed; '\n' is used rather than the
    // platform line separator so that the output does not depend on the platform.
    private static void writeLine(BufferedWriter writer, String text) throws IOException
    {
        writer.write(text);
        writer.write('\n');
    }

    // allowCIDToUnicodeRange returns true if the CID and Unicode destination string are allowed to follow one another
    // according to the Adobe 1.7 specification as described in Section 5.9, Example 5.16.
    // Returns false if either entry is null, i.e. there is no previous entry to extend.
    static boolean allowCIDToUnicodeRange(Map.Entry<Integer, String> prev,
            Map.Entry<Integer, String> next)
    {
        if (prev == null || next == null)
        {
            return false;
        }
        return allowCodeRange(prev.getKey(), next.getKey())
                && allowDestinationRange(prev.getValue(), next.getValue());
    }

    // allowCodeRange returns true if the 16-bit values are sequential and differ only in the low-order byte.
    static boolean allowCodeRange(int prev, int next)
    {
        if ((prev + 1) != next)
        {
            return false;
        }
        int prevH = (prev >> 8) & 0xFF;
        int prevL = prev & 0xFF;
        int nextH = (next >> 8) & 0xFF;
        int nextL = next & 0xFF;

        // prevL < nextL rejects a carry out of the low-order byte (e.g. 0x00FF -> 0x0100)
        return prevH == nextH && prevL < nextL;
    }

    // allowDestinationRange returns true if the code points represented by the strings are sequential and differ
    // only in the low-order byte.
    static boolean allowDestinationRange(String prev, String next)
    {
        if (prev.isEmpty() || next.isEmpty())
        {
            return false;
        }
        int prevCode = prev.codePointAt(0);
        int nextCode = next.codePointAt(0);

        // Allow the new destination string if:
        // 1. It is sequential with the previous one and differs only in the low-order byte
        // 2. Both strings consist of exactly one code point. A range only stores the destination
        //    of its first CID and the others are derived by incrementing it, so a multi code point
        //    string inside a range would lose everything after its first code point.
        return allowCodeRange(prevCode, nextCode)
                && prev.codePointCount(0, prev.length()) == 1
                && next.codePointCount(0, next.length()) == 1;
    }
}