File: StringUtils.cpp

package info (click to toggle)
pbseqlib 5.3.4%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 7,020 kB
  • sloc: cpp: 77,246; python: 331; sh: 103; makefile: 42
file content (140 lines) | stat: -rw-r--r-- 3,946 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#include <pbdata/Types.h>
#include <pbdata/MD5Utils.hpp>
#include <pbdata/StringUtils.hpp>

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Reports whether `pattern` occurs as a contiguous substring of `orig`.
// Returns 1 when it does and 0 when it does not.
int ExactPatternMatch(std::string orig, std::string pattern)
{
    const bool found = (orig.find(pattern) != std::string::npos);
    return found ? 1 : 0;
}

// Hashes a byte buffer with MD5 and stores a prefix of the hex digest.
//
//   data       - bytes to hash
//   dataLength - number of bytes at `data` to feed to the hash
//   md5Str     - output; overwritten with the leading `nChars` characters
//                of the hex digest
//   nChars     - how many hex characters to keep. 0 selects the full digest.
//                Values outside [1, 32] also select the full digest so that
//                the copy below can never run past the digest buffer.
void MakeMD5(const char *data, unsigned int dataLength, std::string &md5Str, int nChars)
{
    // Length used for the "whole digest" case; this is the same value the
    // nChars == 0 default has always used.
    const int kFullDigestLength = 32;

    MD5 md5engine;
    // Same conversion the previous C-style cast performed: drop const, then
    // reinterpret the bytes as unsigned char for the MD5 engine.
    md5engine.update(reinterpret_cast<unsigned char *>(const_cast<char *>(data)), dataLength);
    md5engine.finalize();

    char *md5c_str = md5engine.hex_digest();
    assert(md5c_str != NULL);

    // A negative count would wrap to a huge size_t and a count above the
    // digest length would read out of bounds, so both fall back to the full
    // digest.
    if (nChars <= 0 || nChars > kFullDigestLength) {
        nChars = kFullDigestLength;
    }
    md5Str.assign(md5c_str, static_cast<std::string::size_type>(nChars));

    // The digest buffer is owned by the caller and is released with delete[].
    delete[] md5c_str;
}

// Convenience overload: hashes the contents of `data` by forwarding its
// character buffer and length to the pointer/length overload of MakeMD5.
void MakeMD5(std::string &data, std::string &md5Str, int nChars)
{
    const char *bytes = data.c_str();
    const std::string::size_type byteCount = data.size();
    MakeMD5(bytes, byteCount, md5Str, nChars);
}

// Returns 1 when `c` is a space, tab, newline, carriage return or NUL
// character, and 0 otherwise.
int IsWhitespace(char c)
{
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\0':
            return 1;
        default:
            return 0;
    }
}

// Returns 1 when `c` is a blank (space or horizontal tab), and 0 otherwise.
int IsSpace(char c)
{
    if (c == ' ') {
        return 1;
    }
    return (c == '\t') ? 1 : 0;
}

// Breaks `orig` into words separated by runs of blanks (as decided by
// IsSpace) and appends each word to `words`. Existing entries in `words` are
// kept. Returns the total number of entries in `words` afterwards.
size_t ToWords(std::string &orig, std::vector<std::string> &words)
{
    const size_t length = orig.size();
    size_t cursor = 0;
    while (cursor < length) {
        // Skip the blanks in front of the next word.
        for (; cursor < length && IsSpace(orig[cursor]); ++cursor) {
        }
        const size_t wordBegin = cursor;

        // Advance to the first blank after the word (or the end of input).
        for (; cursor < length && !IsSpace(orig[cursor]); ++cursor) {
        }

        // Trailing blanks produce an empty range; nothing to store then.
        if (cursor > wordBegin) {
            words.push_back(orig.substr(wordBegin, cursor - wordBegin));
        }
    }
    return words.size();
}

// Splits `orig` at every occurrence of `pattern` and stores the pieces in
// `tokens`.
//
//   orig    - text to split
//   pattern - separator; occurrences are matched left to right and do not
//             overlap
//   tokens  - output; cleared first, then filled with the pieces in order.
//             Adjacent separators and separators at either end produce empty
//             tokens.
//
// Returns the number of tokens stored, which is always at least 1.
//
// An empty `pattern` is treated as "no separator present": the whole input
// becomes the single token. (An empty pattern matches at every position
// without consuming anything, so searching for it would never terminate.)
int Splice(const std::string &orig, const std::string &pattern, std::vector<std::string> &tokens)
{
    tokens.clear();

    if (pattern.empty()) {
        tokens.push_back(orig);
        return static_cast<int>(tokens.size());
    }

    std::string::size_type search_start = 0;
    for (std::string::size_type find_pos = orig.find(pattern, search_start);
         find_pos != std::string::npos; find_pos = orig.find(pattern, search_start)) {
        tokens.push_back(orig.substr(search_start, find_pos - search_start));
        // Resume just past the separator that was matched.
        search_start = find_pos + pattern.size();
    }

    // Whatever follows the last separator (possibly nothing) is the final token.
    tokens.push_back(orig.substr(search_start));
    return static_cast<int>(tokens.size());
}

// Splits `csl` on the single character `delim` and appends every non-empty
// field to `values`.
//
//   csl    - delimiter-separated list, e.g. "a,b,c"
//   values - output; fields are appended, existing entries are kept
//   delim  - field separator
//
// Empty fields (from leading, trailing or doubled delimiters) are skipped.
void ParseSeparatedList(const std::string &csl, std::vector<std::string> &values, char delim)
{
    std::stringstream cslStrm(csl);
    std::string valString;
    // getline yields a false stream once no further field can be extracted.
    while (std::getline(cslStrm, valString, delim)) {
        if (!valString.empty()) {
            values.push_back(valString);
        }
    }
}

// Copies the leading characters of `orig` into `result`, stopping before the
// first space, tab, newline, carriage return or NUL, or after `origLength`
// characters if none of those occurs first.
// Returns the number of characters copied.
int AssignUntilFirstSpace(char *orig, int origLength, std::string &result)
{
    int prefixLength = 0;
    while (prefixLength < origLength) {
        const char c = orig[prefixLength];
        const bool isTerminator =
            (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0');
        if (isTerminator) {
            break;
        }
        ++prefixLength;
    }
    result.assign(orig, prefixLength);
    return prefixLength;
}

// Returns a copy of `fileName` with trailing whitespace removed.
//
// Whitespace here means space, tab, newline, carriage return and NUL, the
// same characters IsWhitespace accepts. The input string is not modified.
// An empty or all-whitespace input yields an empty string.
std::string RStrip(std::string &fileName)
{
    // Listed explicitly with a length so that the NUL character is part of the
    // set rather than acting as a terminator.
    static const char kTrailing[] = {' ', '\t', '\n', '\r', '\0'};

    const std::string::size_type lastKept =
        fileName.find_last_not_of(kTrailing, std::string::npos, sizeof(kTrailing));
    if (lastKept == std::string::npos) {
        // Nothing but whitespace (or nothing at all).
        return std::string();
    }
    return fileName.substr(0, lastKept + 1);
}

// Builds the read group id for a movie / read type pair.
//
// Per the PBBAM spec 3.0b5 the id is MD5("${movieName}//${readType}")[0:8]:
//   - movieName is the PacBio platform unit id, e.g. (m140905_042...77_s1_X0)
//   - readType is SUBREAD, CCS or UNKNOWN
// Example from the spec: CCS reads of a movie named "movie32" have
// RGID STRING = "f5b4ffb6".
std::string MakeReadGroupId(const std::string &movieName, const ReadType::ReadTypeEnum &readType)
{
    // Assemble the hash input "<movieName>//<readType>".
    std::string seed(movieName);
    seed += "//";
    seed += ReadType::ToString(readType);

    // Keep only the first 8 hex characters of the digest.
    std::string readGroupId;
    MakeMD5(seed, readGroupId, 8);
    return readGroupId;
}