From: Carbo Kuo <byvoid@byvoid.com>
Date: Thu, 25 Feb 2021 21:13:38 +0900
Subject: Fix a severe performance bug in `Conversion::Convert` that caused
 O(N^2) complexity.

In `Conversion.cpp`, line 27:
```
    Optional<const DictEntry*> matched = dict->MatchPrefix(pstr);
```
pstr is a `const char*`. However, there is no overloaded function which parameter `const char*`.
Therefore it matches `Optional<const DictEntry*> MatchPrefix(const std::string& word) const`.
There is an implicit type conversion from `char*` to `std::string` with time complexity O(N).

I added new benchmark tests. Before the fix:

1: ------------------------------------------------------------------
1: Benchmark                        Time             CPU   Iterations
1: ------------------------------------------------------------------
1: BM_Initialization/hk2s        1.17 ms         1.12 ms          645
1: BM_Initialization/hk2t       0.116 ms        0.116 ms         5922
1: BM_Initialization/jp2t       0.206 ms        0.201 ms         3500
1: BM_Initialization/s2hk        18.2 ms         17.9 ms           40
1: BM_Initialization/s2t         18.2 ms         18.1 ms           39
1: BM_Initialization/s2tw        17.9 ms         17.8 ms           39
1: BM_Initialization/s2twp       18.6 ms         18.4 ms           39
1: BM_Initialization/t2hk       0.055 ms        0.054 ms        12907
1: BM_Initialization/t2jp       0.120 ms        0.117 ms         5978
1: BM_Initialization/t2s        0.988 ms        0.984 ms          710
1: BM_Initialization/tw2s        1.08 ms         1.05 ms          672
1: BM_Initialization/tw2sp       1.26 ms         1.24 ms          563
1: BM_Initialization/tw2t       0.088 ms        0.083 ms         8528
1: BM_Convert2M                   413 ms          413 ms            2
1: BM_Convert/100                1.09 ms         1.09 ms          629
1: BM_Convert/1000               33.2 ms         33.2 ms           21
1: BM_Convert/10000              2822 ms         2822 ms            1
1: BM_Convert/100000 (took longer than 5 minutes, killed)

Now:
1: ------------------------------------------------------------------
1: Benchmark                        Time             CPU   Iterations
1: ------------------------------------------------------------------
1: BM_Initialization/hk2s        1.07 ms         1.07 ms          650
1: BM_Initialization/hk2t       0.114 ms        0.114 ms         6092
1: BM_Initialization/jp2t       0.204 ms        0.200 ms         3503
1: BM_Initialization/s2hk        18.2 ms         18.0 ms           40
1: BM_Initialization/s2t         17.6 ms         17.6 ms           39
1: BM_Initialization/s2tw        18.0 ms         17.9 ms           40
1: BM_Initialization/s2twp       17.9 ms         17.9 ms           39
1: BM_Initialization/t2hk       0.055 ms        0.055 ms        12855
1: BM_Initialization/t2jp       0.125 ms        0.124 ms         5772
1: BM_Initialization/t2s        1.000 ms        0.989 ms          695
1: BM_Initialization/tw2s        1.09 ms         1.07 ms          668
1: BM_Initialization/tw2sp       1.29 ms         1.26 ms          558
1: BM_Initialization/tw2t       0.082 ms        0.082 ms         8558
1: BM_Convert2M                   361 ms          361 ms            2
1: BM_Convert/100               0.741 ms        0.740 ms          948
1: BM_Convert/1000               7.54 ms         7.52 ms           94
1: BM_Convert/10000              76.3 ms         76.3 ms            9
1: BM_Convert/100000              786 ms          786 ms            1

This bug was reported in https://github.com/BYVoid/OpenCC/issues/478 and https://github.com/BYVoid/OpenCC/issues/517.
---
 src/Dict.hpp                  |  7 ++++
 src/benchmark/Performance.cpp | 77 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/src/Dict.hpp b/src/Dict.hpp
index 461d6d2..1c81034 100644
--- a/src/Dict.hpp
+++ b/src/Dict.hpp
@@ -49,6 +49,13 @@ public:
   virtual Optional<const DictEntry*> MatchPrefix(const char* word,
                                                  size_t len) const;
 
+  /**
+   * Matches the longest matched prefix of a word.
+   */
+  Optional<const DictEntry*> MatchPrefix(const char* word) const {
+    return MatchPrefix(word, KeyMaxLength());
+  }
+
   /**
    * Matches the longest matched prefix of a word.
    */
diff --git a/src/benchmark/Performance.cpp b/src/benchmark/Performance.cpp
index cf8d3aa..d1b6468 100644
--- a/src/benchmark/Performance.cpp
+++ b/src/benchmark/Performance.cpp
@@ -1,7 +1,26 @@
+/*
+ * Open Chinese Convert
+ *
+ * Copyright 2020-2021 Carbo Kuo <byvoid@byvoid.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <benchmark/benchmark.h>
 #include <fstream>
 #include <iostream>
 #include <memory>
+#include <sstream>
 #include <streambuf>
 
 #ifdef _MSC_VER
@@ -44,21 +63,32 @@ static void BM_Initialization(benchmark::State& state,
     state.ResumeTiming();
   }
 }
-BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s");
-BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t");
-BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t");
-BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk");
-BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t");
-BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw");
-BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp");
-BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk");
-BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp");
-BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s");
-BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s");
-BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp");
-BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t");
+BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t")->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s")->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp")
+    ->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t")
+    ->Unit(benchmark::kMillisecond);
 
-static void BM_Convert(benchmark::State& state) {
+static void BM_Convert2M(benchmark::State& state) {
   const std::string config_name = "s2t";
   const std::string text = ReadText("zuozhuan.txt");
   const std::unique_ptr<SimpleConverter> converter(Initialize(config_name));
@@ -66,7 +96,24 @@ static void BM_Convert(benchmark::State& state) {
     Convert(converter.get(), text);
   }
 }
-BENCHMARK(BM_Convert)->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Convert2M)->Unit(benchmark::kMillisecond);
+
+static void BM_Convert(benchmark::State& state, int iteration) {
+  std::ostringstream os;
+  for (int i = 0; i < iteration; i++) {
+    os << "Open Chinese Convert 開放中文轉換" << i << std::endl;
+  }
+  const std::string text = os.str();
+  const std::string config_name = "s2t";
+  const std::unique_ptr<SimpleConverter> converter(Initialize(config_name));
+  for (auto _ : state) {
+    Convert(converter.get(), text);
+  }
+}
+BENCHMARK_CAPTURE(BM_Convert, 100, 100)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Convert, 1000, 1000)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Convert, 10000, 10000)->Unit(benchmark::kMillisecond);
+BENCHMARK_CAPTURE(BM_Convert, 100000, 100000)->Unit(benchmark::kMillisecond);
 
 } // namespace opencc
 
