/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is HTML Parser C++ Translator code.
 *
 * The Initial Developer of the Original Code is
 * Mozilla Foundation.
 * Portions created by the Initial Developer are Copyright (C) 2008
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Henri Sivonen <hsivonen@iki.fi>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

package nu.validator.htmlparser.generator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nu.validator.htmlparser.cpptranslate.CppTypes;

/**
 * Command-line generator that reads the HTML table of named character
 * references and writes the C++ sources that embed it: the
 * {@code NamedCharacters} header and implementation, the macro-based
 * {@code NamedCharactersInclude.h} data file and the
 * {@code NamedCharactersAccel} lookup-acceleration header and implementation.
 * All file and class names are prefixed with {@code CppTypes.classPrefix()}.
 */
public class GenerateNamedCharactersCpp {

    /**
     * The license for the output of this program except for data files.
     */
    private static final String OUTPUT_LICENSE = "/*\n"
            + " * Copyright (c) 2008-2010 Mozilla Foundation\n"
            + " *\n"
            + " * Permission is hereby granted, free of charge, to any person obtaining a \n"
            + " * copy of this software and associated documentation files (the \"Software\"), \n"
            + " * to deal in the Software without restriction, including without limitation \n"
            + " * the rights to use, copy, modify, merge, publish, distribute, sublicense, \n"
            + " * and/or sell copies of the Software, and to permit persons to whom the \n"
            + " * Software is furnished to do so, subject to the following conditions:\n"
            + " *\n"
            + " * The above copyright notice and this permission notice shall be included in \n"
            + " * all copies or substantial portions of the Software.\n"
            + " *\n"
            + " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"
            + " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"
            + " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL \n"
            + " * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"
            + " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING \n"
            + " * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER \n"
            + " * DEALINGS IN THE SOFTWARE.\n" + " */\n\n";

    /**
     * The license for the generated data files.
     */
    private static final String DATA_LICENSE = "/*\n"
            + " * Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and Opera \n"
            + " * Software ASA.\n"
            + " * \n"
            + " * You are granted a license to use, reproduce and create derivative works of \n"
            + " * this document.\n" + " */\n\n";

    /**
     * Charset name used both for reading the input and for writing every
     * generated file.
     */
    private static final String ENCODING = "utf-8";

    /**
     * Matches one table row of the input: group 1 is the character name,
     * group 2 the first code point in hex and group 3 the optional second
     * code point in hex.
     */
    private static final Pattern LINE_PATTERN = Pattern.compile("<td> <code title=\"\">([^<]*)</code> </td> <td> U\\+(\\S*) (?:U\\+(\\S*) )?</td>");

    /**
     * First line of the {@code NAMED_CHARACTER_REFERENCE} macro definition
     * (without leading indentation); the expansion follows on the
     * continuation line.
     */
    private static final String MACRO_HEADER = "#define NAMED_CHARACTER_REFERENCE(N, CHARS, LEN, FLAG, VALUE) \\\n";

    /**
     * The C++ literals emitted, in order, into the generated
     * {@code WINDOWS_1252_DATA} array. Kept as text so that the generated
     * file preserves the upper-case hex spelling.
     */
    private static final String[] WINDOWS_1252_DATA = {
            "0x20AC", "0x0081", "0x201A", "0x0192", "0x201E", "0x2026",
            "0x2020", "0x2021", "0x02C6", "0x2030", "0x0160", "0x2039",
            "0x0152", "0x008D", "0x017D", "0x008F", "0x0090", "0x2018",
            "0x2019", "0x201C", "0x201D", "0x2022", "0x2013", "0x2014",
            "0x02DC", "0x2122", "0x0161", "0x203A", "0x0153", "0x009D",
            "0x017E", "0x0178" };

    /**
     * Formats a UTF-16 code unit as a C++ hex literal with exactly four
     * lower-case hex digits, e.g. {@code 0x00e9}.
     *
     * @param c the code unit, in the range 0..0xFFFF
     * @return the literal, always six characters long
     * @throws IllegalArgumentException if {@code c} is outside 0..0xFFFF
     */
    private static String toHexString(int c) {
        if (c < 0 || c > 0xFFFF) {
            throw new IllegalArgumentException("Not a UTF-16 code unit: " + c);
        }
        String digits = Integer.toHexString(c);
        StringBuilder literal = new StringBuilder("0x");
        for (int i = digits.length(); i < 4; i++) {
            literal.append('0');
        }
        return literal.append(digits).toString();
    }

    /**
     * Reads the named character table and writes all generated files.
     *
     * @param args {@code args[0]} is the path of the HTML input file,
     *        {@code args[1]} is the directory the files are written into
     * @throws IOException if reading the input or writing an output fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     *         or the input contains an entry that cannot be represented
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Expected two arguments: <input HTML file> <target directory>");
        }
        TreeMap<String, String> entities = readEntities(args[0]);

        CppTypes cppTypes = new CppTypes(null);
        File targetDirectory = new File(args[1]);

        generateH(targetDirectory, cppTypes, entities);
        generateInclude(targetDirectory, cppTypes, entities);
        generateCpp(targetDirectory, cppTypes, entities);
        generateAccelH(targetDirectory, cppTypes, entities);
        generateAccelCpp(targetDirectory, cppTypes, entities);
    }

    /**
     * Parses every row matching {@link #LINE_PATTERN} in the given file into
     * a sorted name-to-value map. The reader is closed even when parsing
     * fails.
     *
     * @param inputPath path of the HTML input file
     * @return the entities, sorted by name
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if a name is shorter than two
     *         characters (the generators index the first two characters)
     */
    private static TreeMap<String, String> readEntities(String inputPath)
            throws IOException {
        TreeMap<String, String> entities = new TreeMap<String, String>();
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(inputPath), ENCODING));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher m = LINE_PATTERN.matcher(line);
                while (m.find()) {
                    String name = m.group(1);
                    if (name.length() < 2) {
                        // Fail before any output file is written instead of
                        // with an index error halfway through generation.
                        throw new IllegalArgumentException(
                                "Named character name shorter than two characters: \""
                                        + name + "\"");
                    }
                    entities.put(name, decodeValue(m.group(2), m.group(3)));
                }
            }
        } finally {
            reader.close();
        }
        return entities;
    }

    /**
     * Converts the hex code point(s) of one table row into the UTF-16 string
     * stored for the entity: a single code point becomes one char or a
     * surrogate pair, two code points become two chars.
     *
     * @param firstHex hex digits of the first code point
     * @param secondHex hex digits of the second code point, or {@code null}
     *        if the row has only one
     * @return a string of one or two UTF-16 code units
     * @throws IllegalArgumentException if the value does not fit in two
     *         UTF-16 code units or is not a valid code point
     */
    private static String decodeValue(String firstHex, String secondHex) {
        int first = Integer.parseInt(firstHex, 16);
        if (secondHex == null) {
            // One code point: one char for the BMP, a surrogate pair above it.
            return new String(Character.toChars(first));
        }
        int second = Integer.parseInt(secondHex, 16);
        if (first < 0 || first > 0xFFFF || second < 0 || second > 0xFFFF) {
            // The generated VALUES table has two code units per entry, so
            // casting a non-BMP code point to char would silently corrupt it.
            throw new IllegalArgumentException(
                    "Two-code-point value must consist of BMP characters: U+"
                            + firstHex + " U+" + secondHex);
        }
        return new String(new char[] { (char) first, (char) second });
    }

    /**
     * Opens a writer for a generated file using {@link #ENCODING}.
     */
    private static Writer openWriter(File file) throws IOException {
        return new OutputStreamWriter(new FileOutputStream(file), ENCODING);
    }

    /**
     * Writes one {@code #include "X.h"} line for each entry returned by
     * {@code cppTypes.namedCharactersIncludes()}.
     */
    private static void writeIncludes(Writer out, CppTypes cppTypes)
            throws IOException {
        String[] includes = cppTypes.namedCharactersIncludes();
        for (int i = 0; i < includes.length; i++) {
            out.write("#include \"" + includes[i] + ".h\"\n");
        }
    }

    /**
     * Writes the {@code NamedCharactersAccel.cpp} file: one
     * {@code HILO_ACCEL_<n>} array per non-empty table row plus the
     * {@code HILO_ACCEL} array of row pointers (0 for empty rows).
     *
     * @param targetDirectory directory the file is written into
     * @param cppTypes source of the class prefix and C++ type names
     * @param entities the named characters, sorted by name
     * @throws IOException if writing fails
     */
    private static void generateAccelCpp(File targetDirectory,
            CppTypes cppTypes, TreeMap<String, String> entities) throws IOException {
        File cppFile = new File(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersAccel.cpp");
        int[][] hiLoTable = buildHiLoTable(entities);

        Writer out = openWriter(cppFile);
        try {
            out.write(DATA_LICENSE);
            out.write('\n');
            out.write("#include \"" + cppTypes.classPrefix()
                    + "NamedCharactersAccel.h\"\n");
            out.write("\n");

            for (int i = 0; i < hiLoTable.length; i++) {
                if (!allZero(hiLoTable[i])) {
                    out.write("static " + cppTypes.intType() + " const HILO_ACCEL_"
                            + i + "[] = {\n");
                    for (int j = 0; j < hiLoTable[i].length; j++) {
                        if (j != 0) {
                            out.write(", ");
                        }
                        out.write("" + hiLoTable[i][j]);
                    }
                    out.write("\n};\n\n");
                }
            }

            // Use the same element type as the declaration written by
            // generateAccelH so that definition and declaration always agree.
            out.write("const " + cppTypes.intType() + "* const "
                    + cppTypes.classPrefix()
                    + "NamedCharactersAccel::HILO_ACCEL[] = {\n");
            for (int i = 0; i < hiLoTable.length; i++) {
                if (i != 0) {
                    out.write(",\n");
                }
                if (allZero(hiLoTable[i])) {
                    out.write("  0");
                } else {
                    out.write("  HILO_ACCEL_" + i);
                }
            }
            out.write("\n};\n\n");
        } finally {
            out.close();
        }
    }

    /**
     * Builds the hi/lo acceleration table. The row index is the raw value of
     * the second character of a name, the column index is
     * {@link #charToIndex(char)} of the first character. Each populated cell
     * holds {@code (hi << 16) | lo}, where {@code lo} and {@code hi} are the
     * indices (in sorted order) of the first and last entity sharing those
     * two leading characters.
     *
     * @param entities the named characters, sorted by name; every name must
     *        have at least two characters
     * @return the table; cells without any entity stay 0
     */
    private static int[][] buildHiLoTable(TreeMap<String, String> entities) {
        // Java initializes arrays to zero. Zero is our magic value for no hilo
        // value.
        int[][] hiLoTable = new int['z' + 1]['Z' - 'A' + 1 + 'z' - 'a' + 1];
        if (entities.isEmpty()) {
            // Nothing to index; leave every cell at the "no hilo" value.
            return hiLoTable;
        }

        String firstName = entities.firstKey();
        int firstKey = charToIndex(firstName.charAt(0));
        int secondKey = firstName.charAt(1);
        int row = 0;
        int lo = 0;

        for (String name : entities.keySet()) {
            int newFirst = charToIndex(name.charAt(0));
            int newSecond = name.charAt(1);
            // newSecond is a raw char (not an index), so "AA" is index 0
            // followed by 'A'. Comparing it with 0 could never fire.
            assert !(newFirst == 0 && newSecond == 'A') : "Not prepared for name starting with AA";
            if (firstKey != newFirst || secondKey != newSecond) {
                // The previous two-letter group ended on the previous row.
                hiLoTable[secondKey][firstKey] = ((row - 1) << 16) | lo;
                lo = row;
                firstKey = newFirst;
                secondKey = newSecond;
            }
            row++;
        }

        // Close the last group.
        hiLoTable[secondKey][firstKey] = ((entities.size() - 1) << 16) | lo;
        return hiLoTable;
    }

    /**
     * Writes the {@code NamedCharactersAccel.h} header declaring the
     * {@code HILO_ACCEL} table.
     *
     * @param targetDirectory directory the file is written into
     * @param cppTypes source of the class prefix, includes and C++ type names
     * @param entities unused; kept for a uniform generator signature
     * @throws IOException if writing fails
     */
    private static void generateAccelH(File targetDirectory, CppTypes cppTypes,
            TreeMap<String, String> entities) throws IOException {
        File hFile = new File(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersAccel.h");
        Writer out = openWriter(hFile);
        try {
            out.write(DATA_LICENSE);
            out.write("#ifndef " + cppTypes.classPrefix() + "NamedCharactersAccel_h_\n");
            out.write("#define " + cppTypes.classPrefix() + "NamedCharactersAccel_h_\n");
            out.write('\n');

            writeIncludes(out, cppTypes);

            out.write('\n');

            out.write("class " + cppTypes.classPrefix() + "NamedCharactersAccel\n");
            out.write("{\n");
            out.write("  public:\n");
            out.write("    static const " + cppTypes.intType()
                    + "* const HILO_ACCEL[];\n");
            out.write("};\n");

            out.write("\n#endif // " + cppTypes.classPrefix()
                    + "NamedCharactersAccel_h_\n");
        } finally {
            out.close();
        }
    }

    /**
     * Writes the {@code NamedCharacters.h} header: the character-name struct
     * and the {@code NamedCharacters} class declaration.
     *
     * @param targetDirectory directory the file is written into
     * @param cppTypes source of the class prefix, includes and C++ type names
     * @param entities unused; kept for a uniform generator signature
     * @throws IOException if writing fails
     */
    private static void generateH(File targetDirectory, CppTypes cppTypes,
            Map<String, String> entities) throws IOException {
        File hFile = new File(targetDirectory, cppTypes.classPrefix()
                + "NamedCharacters.h");
        Writer out = openWriter(hFile);
        try {
            out.write(OUTPUT_LICENSE);
            out.write("#ifndef " + cppTypes.classPrefix() + "NamedCharacters_h_\n");
            out.write("#define " + cppTypes.classPrefix() + "NamedCharacters_h_\n");
            out.write('\n');

            writeIncludes(out, cppTypes);

            out.write("\nstruct ");
            out.write(cppTypes.characterNameTypeDeclaration());
            out.write(" {\n  ");
            out.write(cppTypes.unsignedShortType());
            out.write(" nameStart;\n  ");
            out.write(cppTypes.unsignedShortType());
            out.write(" nameLen;\n  #ifdef DEBUG\n  ");
            out.write(cppTypes.intType());
            out.write(" n;\n  #endif\n  ");
            out.write(cppTypes.intType());
            out.write(" length() const;\n  ");
            out.write(cppTypes.charType());
            out.write(" charAt(");
            out.write(cppTypes.intType());
            out.write(" index) const;\n};\n\n");

            out.write("class " + cppTypes.classPrefix() + "NamedCharacters\n");
            out.write("{\n");
            out.write("  public:\n");
            out.write("    static const " + cppTypes.characterNameTypeDeclaration() + " NAMES[];\n");
            out.write("    static const " + cppTypes.charType() + " VALUES[][2];\n");
            out.write("    static " + cppTypes.charType() + "** WINDOWS_1252;\n");
            out.write("    static void initializeStatics();\n");
            out.write("    static void releaseStatics();\n");
            out.write("};\n");

            out.write("\n#endif // " + cppTypes.classPrefix()
                    + "NamedCharacters_h_\n");
        } finally {
            out.close();
        }
    }

    /**
     * Writes the {@code NamedCharactersInclude.h} data file: one
     * {@code NAMED_CHARACTER_REFERENCE(N, CHARS, LEN, FLAG, VALUE)} macro
     * invocation per entity, in map iteration order, numbered from 0.
     *
     * @param targetDirectory directory the file is written into
     * @param cppTypes source of the class prefix
     * @param entities name-to-value map; names need at least two characters
     *        and values one or two UTF-16 code units
     * @throws IOException if writing fails
     */
    private static void generateInclude(File targetDirectory,
            CppTypes cppTypes, Map<String, String> entities) throws IOException {
        File includeFile = new File(targetDirectory, cppTypes.classPrefix()
                + "NamedCharactersInclude.h");
        Writer out = openWriter(includeFile);
        try {
            out.write(DATA_LICENSE);
            out.write("/* Data generated from the table of named character references found at\n");
            out.write(" *\n");
            out.write(" *   http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html#named-character-references\n");
            out.write(" *\n");
            out.write(" * Files that #include this file must #define NAMED_CHARACTER_REFERENCE as a\n");
            // The macro takes five arguments (N, CHARS, LEN, FLAG, VALUE).
            out.write(" * macro of five parameters:\n");
            out.write(" *\n");
            out.write(" *   1.  a unique integer N identifying the Nth [0,1,..] macro expansion in this file,\n");
            out.write(" *   2.  a comma-separated sequence of characters comprising the character name,\n");
            out.write(" *       without the first two letters or 0 if the sequence would be empty. \n");
            out.write(" *       See Tokenizer.java.\n");
            out.write(" *   3.  the length of this sequence of characters,\n");
            out.write(" *   4.  placeholder flag (0 if argument #2 is not a placeholder and 1 if it is),\n");
            out.write(" *   5.  a comma-separated sequence of PRUnichar literals corresponding\n");
            out.write(" *       to the code-point(s) of the named character.\n");
            out.write(" *\n");
            out.write(" * The macro expansion doesn't have to refer to all or any of these parameters,\n");
            out.write(" * but common sense dictates that it should involve at least one of them.\n");
            out.write(" */\n");
            out.write("\n");
            out.write("// This #define allows the NAMED_CHARACTER_REFERENCE macro to accept comma-\n");
            out.write("// separated sequences as single macro arguments.  Using commas directly would\n");
            out.write("// split the sequence into multiple macro arguments.\n");
            out.write("#define _ ,\n");
            out.write("\n");

            int i = 0;
            for (Map.Entry<String, String> entity : entities.entrySet()) {
                out.write("NAMED_CHARACTER_REFERENCE(" + i++ + ", ");
                String name = entity.getKey();
                writeNameInitializer(out, name, " _ ");
                out.write(", " + (name.length() - 2) + ", ");
                // FLAG is 1 when CHARS is only the "0" placeholder.
                out.write((name.length() == 2 ? "1" : "0") + ", ");
                writeValueInitializer(out, entity.getValue(), " _ ");
                out.write(")\n");
            }

            out.write("\n");
            out.write("#undef _\n");
        } finally {
            out.close();
        }
    }

    /**
     * Writes the CHARS macro argument for a name: a comment showing the
     * first two characters, followed by the remaining characters as quoted
     * char literals joined by {@code separator}, or {@code 0} if the name
     * has exactly two characters.
     *
     * @param out destination
     * @param name the character name; at least two characters long
     * @param separator text written between consecutive literals
     * @throws IOException if writing fails
     */
    private static void writeNameInitializer(Writer out,
            String name, String separator)
            throws IOException {
        out.write("/* " + name.charAt(0) + " " + name.charAt(1) + " */ ");
        if (name.length() == 2) {
            out.write("0");
            return;
        }
        for (int i = 2; i < name.length(); i++) {
            if (i > 2) {
                out.write(separator);
            }
            out.write("'" + name.charAt(i) + "'");
        }
    }

    /**
     * Writes the VALUE macro argument: the first code unit as a hex literal,
     * the separator, then the second code unit as a hex literal or {@code 0}
     * if the value has only one code unit.
     *
     * @param out destination
     * @param value the entity value; one or two UTF-16 code units
     * @param separator text written between the two items
     * @throws IOException if writing fails
     */
    private static void writeValueInitializer(Writer out,
            String value, String separator)
            throws IOException {
        out.write(toHexString(value.charAt(0)));
        out.write(separator);
        out.write(value.length() == 1 ? "0" : toHexString(value.charAt(1)));
    }

    /**
     * Writes C++ that defines {@code NAMED_CHARACTER_REFERENCE} with the
     * given expansion, includes the data file and undefines the macro again.
     *
     * @param out destination
     * @param expansion replacement text of the macro
     * @param includeFile name of the data file to {@code #include}
     * @throws IOException if writing fails
     */
    private static void defineMacroAndInclude(Writer out, String expansion,
            String includeFile) throws IOException {
        out.write(MACRO_HEADER + expansion + "\n");
        out.write("#include \"" + includeFile + "\"\n");
        out.write("#undef NAMED_CHARACTER_REFERENCE\n");
    }

    /**
     * Like {@link #defineMacroAndInclude(Writer, String, String)}, but the
     * generated code selects {@code debugExpansion} when {@code DEBUG} is
     * defined and {@code expansion} otherwise.
     *
     * @param out destination
     * @param expansion replacement text used without {@code DEBUG}
     * @param debugExpansion replacement text used with {@code DEBUG}
     * @param includeFile name of the data file to {@code #include}
     * @throws IOException if writing fails
     */
    private static void defineMacroAndInclude(Writer out, String expansion,
            String debugExpansion, String includeFile) throws IOException {
        out.write("#ifdef DEBUG\n");
        out.write("  " + MACRO_HEADER + debugExpansion + "\n");
        out.write("#else\n");
        out.write("  " + MACRO_HEADER + expansion + "\n");
        out.write("#endif\n");
        out.write("#include \"" + includeFile + "\"\n");
        out.write("#undef NAMED_CHARACTER_REFERENCE\n");
    }

    /**
     * Writes the out-of-class definition line
     * {@code <type> <prefix>NamedCharacters::<name>;}.
     */
    private static void writeStaticMemberDeclaration(Writer out,
            CppTypes cppTypes, String type, String name) throws IOException {
        out.write(type + " " + cppTypes.classPrefix() + "NamedCharacters::"
                + name + ";\n");
    }

    /**
     * Maps an ASCII letter to a table column: {@code 'A'..'Z'} to 0..25 and
     * {@code 'a'..'z'} to 26..51.
     *
     * @param c the first character of a name
     * @return the column index
     * @throws IllegalArgumentException if {@code c} is not an ASCII letter
     */
    private static int charToIndex(char c) {
        if (c >= 'a' && c <= 'z') {
            return c - 'a' + 26;
        } else if (c >= 'A' && c <= 'Z') {
            return c - 'A';
        }
        throw new IllegalArgumentException("Bad char in named character name: "
                + c);
    }

    /**
     * Returns {@code true} if every element of {@code arr} is 0 (including
     * when the array is empty).
     */
    private static boolean allZero(int[] arr) {
        for (int value : arr) {
            if (value != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes the {@code NamedCharacters.cpp} file: the {@code VALUES} table,
     * the Windows-1252 data, the packed name data with its position enum,
     * the {@code NAMES} table, the character-name accessors and the statics
     * initialization/release functions.
     *
     * @param targetDirectory directory the file is written into
     * @param cppTypes source of the class prefix, includes and C++ type names
     * @param entities unused; the data comes from the include file at C++
     *        compile time
     * @throws IOException if writing fails
     */
    private static void generateCpp(File targetDirectory, CppTypes cppTypes,
            Map<String, String> entities) throws IOException {
        String includeFile = cppTypes.classPrefix()
                + "NamedCharactersInclude.h";
        File cppFile = new File(targetDirectory, cppTypes.classPrefix()
                + "NamedCharacters.cpp");
        Writer out = openWriter(cppFile);
        try {
            out.write(OUTPUT_LICENSE);
            out.write("#define " + cppTypes.classPrefix()
                    + "NamedCharacters_cpp_\n");

            writeIncludes(out, cppTypes);

            out.write('\n');
            out.write("#include \"" + cppTypes.classPrefix()
                    + "NamedCharacters.h\"\n");
            out.write("\n");

            out.write("const " + cppTypes.charType() + " " + cppTypes.classPrefix()
                    + "NamedCharacters::VALUES[][2] = {\n");
            defineMacroAndInclude(out, "{ VALUE },", includeFile);
            // The useless terminator entry makes the above macro simpler with
            // compilers that whine about a comma after the last item
            out.write("{0, 0} };\n\n");

            writeWindows1252Data(out, cppTypes);
            writeNameTables(out, cppTypes, includeFile);
            writeNameAccessors(out, cppTypes);
            writeStaticsFunctions(out, cppTypes);
        } finally {
            out.close();
        }
    }

    /**
     * Writes the {@code WINDOWS_1252} static member definition and the
     * {@code WINDOWS_1252_DATA} array.
     */
    private static void writeWindows1252Data(Writer out, CppTypes cppTypes)
            throws IOException {
        String staticMemberType = cppTypes.charType() + "**";
        writeStaticMemberDeclaration(out, cppTypes, staticMemberType,
                "WINDOWS_1252");

        out.write("static " + cppTypes.charType()
                + " const WINDOWS_1252_DATA[] = {\n");
        int last = WINDOWS_1252_DATA.length - 1;
        for (int i = 0; i <= last; i++) {
            // No trailing comma after the final element.
            out.write("  " + WINDOWS_1252_DATA[i] + (i < last ? ",\n" : "\n"));
        }
        out.write("};\n\n");
    }

    /**
     * Writes the packed {@code ALL_NAMES} array, the {@code NamePositions}
     * enum, the optional static size check and the {@code NAMES} table.
     */
    private static void writeNameTables(Writer out, CppTypes cppTypes,
            String includeFile) throws IOException {
        out.write("/**\n");
        out.write(" * To avoid having lots of pointers in the |charData| array, below,\n");
        out.write(" * which would cause us to have to do lots of relocations at library\n");
        out.write(" * load time, store all the string data for the names in one big array.\n");
        out.write(" * Then use tricks with enums to help us build an array that contains\n");
        out.write(" * the positions of each within the big arrays.\n");
        out.write(" */\n\n");

        out.write("static const " + cppTypes.byteType() + " ALL_NAMES[] = {\n");

        defineMacroAndInclude(out, "CHARS ,", includeFile);

        out.write("};\n\n");

        out.write("enum NamePositions {\n");
        out.write("  DUMMY_INITIAL_NAME_POSITION = 0,\n");

        out.write("/* enums don't take up space, so generate _START and _END */\n");
        defineMacroAndInclude(out,
                "NAME_##N##_DUMMY, /* automatically one higher than previous */ \\\n"
                        + "NAME_##N##_START = NAME_##N##_DUMMY - 1, \\\n"
                        + "NAME_##N##_END = NAME_##N##_START + LEN + FLAG,",
                includeFile);

        out.write("  DUMMY_FINAL_NAME_VALUE\n");
        out.write("};\n\n");

        // The size check is only emitted when both pieces are available.
        String arrayLengthMacro = cppTypes.arrayLengthMacro();
        String staticAssert = cppTypes.staticAssert();
        if (staticAssert != null && arrayLengthMacro != null) {
            out.write("/* check that the start positions will fit in 16 bits */\n");
            out.write(staticAssert + "(" + arrayLengthMacro
                    + "(ALL_NAMES) < 0x10000);\n\n");
        }

        out.write("const " + cppTypes.characterNameTypeDeclaration() + " " + cppTypes.classPrefix()
                + "NamedCharacters::NAMES[] = {\n");
        defineMacroAndInclude(out, "{ NAME_##N##_START, LEN, },", "{ NAME_##N##_START, LEN, N },", includeFile);
        out.write("};\n\n");
    }

    /**
     * Writes the definitions of the character-name struct's
     * {@code length()} and {@code charAt()} member functions.
     */
    private static void writeNameAccessors(Writer out, CppTypes cppTypes)
            throws IOException {
        out.write(cppTypes.intType());
        out.write("\n");
        out.write(cppTypes.characterNameTypeDeclaration());
        out.write("::length() const\n{\n  return nameLen;\n}\n\n");
        out.write(cppTypes.charType());
        out.write("\n");
        out.write(cppTypes.characterNameTypeDeclaration());
        out.write("::charAt(");
        // Must match the parameter type of the declaration written by
        // generateH, which uses cppTypes.intType().
        out.write(cppTypes.intType());
        out.write(" index) const\n{\n  return static_cast<");
        out.write(cppTypes.charType());
        out.write("> (ALL_NAMES[nameStart + index]);\n}\n\n");
    }

    /**
     * Writes the definitions of {@code initializeStatics()} and
     * {@code releaseStatics()}, which allocate and free the
     * {@code WINDOWS_1252} pointer array.
     */
    private static void writeStaticsFunctions(Writer out, CppTypes cppTypes)
            throws IOException {
        // Derived from the data so the generated bounds cannot drift from
        // the generated array.
        int count = WINDOWS_1252_DATA.length;

        out.write("void\n");
        out.write(cppTypes.classPrefix()
                + "NamedCharacters::initializeStatics()\n");
        out.write("{\n");
        out.write("  WINDOWS_1252 = new " + cppTypes.charType() + "*[" + count + "];\n");
        out.write("  for (" + cppTypes.intType() + " i = 0; i < " + count + "; ++i) {\n");
        out.write("    WINDOWS_1252[i] = (" + cppTypes.charType()
                + "*)&(WINDOWS_1252_DATA[i]);\n");
        out.write("  }\n");
        out.write("}\n");
        out.write("\n");

        out.write("void\n");
        out.write(cppTypes.classPrefix()
                + "NamedCharacters::releaseStatics()\n");
        out.write("{\n");
        out.write("  delete[] WINDOWS_1252;\n");
        out.write("}\n");
    }
}
