/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
// @lint-ignore-every LICENSELINT

// A Tokenizer implementation backed by the SentencePiece library
// (sentencepiece::SentencePieceProcessor). Used by Llama2.
#pragma once

#include <pytorch/tokenizers/tokenizer.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
#include "sentencepiece_processor.h"

namespace tokenizers {

class SPTokenizer : public Tokenizer {
 public:
  explicit SPTokenizer();
  ~SPTokenizer() override;

  // Loads the sentencepiece model from the file at tokenizer_path.
  Error load(const std::string& tokenizer_path) override;

  // Converts between a token id and its corresponding piece string.
  Result<std::string> id_to_piece(uint64_t token) const override;
  Result<uint64_t> piece_to_id(const std::string& text) const override;

  // Encodes input into token ids. bos and eos give the number of BOS/EOS
  // tokens to prepend and append, respectively.
  Result<std::vector<uint64_t>>
  encode(const std::string& input, int8_t bos, int8_t eos) const override;

  // Decodes a single token to text. prev_token provides context so
  // sentencepiece can decide whether a leading space is needed.
  Result<std::string> decode(
      uint64_t prev_token,
      uint64_t token,
      bool skip_special_tokens = false) const override;

 private:
  // Underlying sentencepiece processor; owns the loaded model.
  std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
};
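
// A minimal usage sketch (illustrative only; assumes the Error/Result API
// declared in tokenizer.h, and the model path below is a placeholder):
//
//   SPTokenizer tok;
//   if (tok.load("/path/to/tokenizer.model") == Error::Ok) {
//     // Prepend one BOS token and append no EOS token.
//     auto ids = tok.encode("hello world", /*bos=*/1, /*eos=*/0);
//   }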

} // namespace tokenizers
