File: clip_tokenizer.h

package info (click to toggle)
pytorch-text 0.14.1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 11,560 kB
  • sloc: python: 14,197; cpp: 2,404; sh: 214; makefile: 20
file content (51 lines) | stat: -rw-r--r-- 1,522 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#ifndef CLIP_TOKENIZER_H_
#define CLIP_TOKENIZER_H_

#include <torchtext/csrc/export.h>
#include <torchtext/csrc/gpt2_bpe_tokenizer.h>

namespace torchtext {

// Flattened CLIPEncoder state as exchanged across the pybind11 boundary,
// using STL containers so Python can construct/inspect it directly.
// NOTE(review): element meanings are inherited from the GPT-2 BPE encoder
// state layout — confirm against gpt2_bpe_tokenizer.h.
// (Modernized from `typedef` to a `using` alias; identical type, C++11 idiom.)
using CLIPEncoderStatesPybind = std::tuple<
    std::unordered_map<std::string, int64_t>,
    std::unordered_map<std::string, int64_t>,
    std::string,
    std::unordered_map<int64_t, std::string>,
    bool>;

// Flattened CLIPEncoder state for TorchScript (torchbind) serialization;
// mirrors the pybind variant but uses c10::Dict, which TorchScript can
// serialize. NOTE(review): element meanings are inherited from the GPT-2
// BPE encoder state layout — confirm against gpt2_bpe_tokenizer.h.
// (Modernized from `typedef` to a `using` alias; identical type, C++11 idiom.)
using CLIPEncoderStatesTorchbind = std::tuple<
    c10::Dict<std::string, int64_t>,
    c10::Dict<std::string, int64_t>,
    std::string,
    c10::Dict<int64_t, std::string>,
    bool>;

// BPE tokenizer for CLIP. Reuses the GPT-2 BPE encoder's machinery and
// constructors, overriding only the pre-tokenization and BPE merge steps
// with CLIP-specific behavior (implementations live in the .cpp file).
struct CLIPEncoder : GPT2BPEEncoder {
 public:
  // Inherit all GPT2BPEEncoder constructors unchanged.
  using GPT2BPEEncoder::GPT2BPEEncoder;

  // Encode `text` into a sequence of BPE token ids.
  TORCHTEXT_API std::vector<int64_t> Encode(const std::string& text);
  // Split `text` into BPE token strings (the string form of Encode's ids).
  TORCHTEXT_API std::vector<std::string> Tokenize(const std::string& text);

 protected:
  // CLIP-specific BPE merge step applied to a pre-tokenized token list;
  // overrides the GPT-2 implementation.
  TORCHTEXT_API std::vector<std::string> BPE_(
      const std::vector<std::string>& token_list) override;

  // CLIP-specific splitting of raw input into pre-tokens before BPE;
  // overrides the GPT-2 implementation.
  TORCHTEXT_API std::vector<std::string> PreTokenize_(
      std::string input) override;
};

// Flatten a CLIPEncoder into the STL-container state tuple used for
// Python-side (pybind11) pickling.
TORCHTEXT_API CLIPEncoderStatesPybind
_serialize_clip_encoder_pybind(const c10::intrusive_ptr<CLIPEncoder>& self);
// Flatten a CLIPEncoder into the c10::Dict state tuple used for
// TorchScript (torchbind) serialization.
// NOTE(review): the torchbind pair is not marked TORCHTEXT_API — presumably
// only referenced inside the library; confirm before exporting.
CLIPEncoderStatesTorchbind _serialize_clip_encoder_torchbind(
    const c10::intrusive_ptr<CLIPEncoder>& self);
// Rebuild a CLIPEncoder from the pybind state tuple.
TORCHTEXT_API c10::intrusive_ptr<CLIPEncoder> _deserialize_clip_encoder_pybind(
    CLIPEncoderStatesPybind states);
// Rebuild a CLIPEncoder from the torchbind state tuple.
c10::intrusive_ptr<CLIPEncoder> _deserialize_clip_encoder_torchbind(
    CLIPEncoderStatesTorchbind states);

} // namespace torchtext

#endif // CLIP_TOKENIZER_H_