1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
|
#include <iostream>
#include <fstream>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/dnn/dnn.hpp>
using namespace cv;
using namespace cv::dnn;
// Command-line option specification passed to cv::CommandLineParser in main().
// Each entry is written as "{ name alias | default value | help text }".
// Option names, aliases and defaults are what main() reads; only the help
// texts were corrected so that they describe what the sample actually does.
std::string keys =
    "{ help h | | Print help message. }"
    "{ inputImage i | | Path to an input image (required). }"
    "{ detModelPath dmp | | Path to a binary .onnx model for detection. "
    "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
    "{ recModelPath rmp | | Path to a binary .onnx model for recognition. "
    "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
    "{ inputHeight ih |736| image height of the model input. It should be a multiple of 32.}"
    "{ inputWidth iw |736| image width of the model input. It should be a multiple of 32.}"
    "{ RGBInput rgb |0| 0: pass grayscale crops to the recognition model; 1: pass color (BGR) crops. }"
    "{ binaryThreshold bt |0.3| Confidence threshold of the binary map. }"
    "{ polygonThreshold pt |0.5| Confidence threshold of polygons. }"
    "{ maxCandidate max |200| Max candidates of polygons. }"
    "{ unclipRatio ratio |2.0| unclip ratio. }"
    "{ vocabularyPath vp | alphabet_36.txt | Path to the vocabulary file of the recognition model (one entry per line). "
    "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
// Warps the quadrilateral of `frame` described by the four `vertices` into a
// fixed-size upright patch and stores it in `result` (defined below main()).
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
// Comparison predicate: true when p1.x is smaller than p2.x.
// NOTE(review): no call to this function is visible in this file.
bool sortPts(const Point& p1, const Point& p2);
int main(int argc, char** argv)
{
// Parse arguments
CommandLineParser parser(argc, argv, keys);
parser.about("Use this script to run an end-to-end inference sample of textDetectionModel and textRecognitionModel APIs\n"
"Use -h for more information");
if (argc == 1 || parser.has("help"))
{
parser.printMessage();
return 0;
}
float binThresh = parser.get<float>("binaryThreshold");
float polyThresh = parser.get<float>("polygonThreshold");
uint maxCandidates = parser.get<uint>("maxCandidate");
String detModelPath = parser.get<String>("detModelPath");
String recModelPath = parser.get<String>("recModelPath");
String vocPath = parser.get<String>("vocabularyPath");
double unclipRatio = parser.get<double>("unclipRatio");
int height = parser.get<int>("inputHeight");
int width = parser.get<int>("inputWidth");
int imreadRGB = parser.get<int>("RGBInput");
if (!parser.check())
{
parser.printErrors();
return 1;
}
// Load networks
CV_Assert(!detModelPath.empty());
TextDetectionModel_DB detector(detModelPath);
detector.setBinaryThreshold(binThresh)
.setPolygonThreshold(polyThresh)
.setUnclipRatio(unclipRatio)
.setMaxCandidates(maxCandidates);
CV_Assert(!recModelPath.empty());
TextRecognitionModel recognizer(recModelPath);
// Load vocabulary
CV_Assert(!vocPath.empty());
std::ifstream vocFile;
vocFile.open(samples::findFile(vocPath));
CV_Assert(vocFile.is_open());
String vocLine;
std::vector<String> vocabulary;
while (std::getline(vocFile, vocLine)) {
vocabulary.push_back(vocLine);
}
recognizer.setVocabulary(vocabulary);
recognizer.setDecodeType("CTC-greedy");
// Parameters for Detection
double detScale = 1.0 / 255.0;
Size detInputSize = Size(width, height);
Scalar detMean = Scalar(122.67891434, 116.66876762, 104.00698793);
detector.setInputParams(detScale, detInputSize, detMean);
// Parameters for Recognition
double recScale = 1.0 / 127.5;
Scalar recMean = Scalar(127.5);
Size recInputSize = Size(100, 32);
recognizer.setInputParams(recScale, recInputSize, recMean);
// Create a window
static const std::string winName = "Text_Spotting";
// Input data
Mat frame = imread(samples::findFile(parser.get<String>("inputImage")));
std::cout << frame.size << std::endl;
// Inference
std::vector< std::vector<Point> > detResults;
detector.detect(frame, detResults);
Mat frame2 = frame.clone();
if (detResults.size() > 0) {
// Text Recognition
Mat recInput;
if (!imreadRGB) {
cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
} else {
recInput = frame;
}
std::vector< std::vector<Point> > contours;
for (uint i = 0; i < detResults.size(); i++)
{
const auto& quadrangle = detResults[i];
CV_CheckEQ(quadrangle.size(), (size_t)4, "");
contours.emplace_back(quadrangle);
std::vector<Point2f> quadrangle_2f;
for (int j = 0; j < 4; j++)
quadrangle_2f.emplace_back(quadrangle[j]);
// Transform and Crop
Mat cropped;
fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
std::string recognitionResult = recognizer.recognize(cropped);
std::cout << i << ": '" << recognitionResult << "'" << std::endl;
putText(frame2, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 0, 255), 2);
}
polylines(frame2, contours, true, Scalar(0, 255, 0), 2);
} else {
std::cout << "No Text Detected." << std::endl;
}
imshow(winName, frame2);
waitKey();
return 0;
}
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
{
const Size outputSize = Size(100, 32);
Point2f targetVertices[4] = {
Point(0, outputSize.height - 1),
Point(0, 0),
Point(outputSize.width - 1, 0),
Point(outputSize.width - 1, outputSize.height - 1)
};
Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
warpPerspective(frame, result, rotationMatrix, outputSize);
#if 0
imshow("roi", result);
waitKey();
#endif
}
// Ordering predicate on points: p1 precedes p2 when its x coordinate is smaller.
// The y coordinate is not taken into account.
bool sortPts(const Point& p1, const Point& p2)
{
    const bool firstIsLeft = p2.x > p1.x;
    return firstIsLeft;
}
|