1. 翻译任务

Transformer 最初用于翻译任务,在实验上取得了不错的效果。本章用 libtorch 库实现一个把英语句子翻译成中文的例子:

“Welcome to PyTorch Tutorials” 翻译成 “欢迎来到派托奇教程”
“Welcome to Machine Learning” 翻译成 “欢迎来到机器学习”

2. 定义词汇表

定义英文词汇表和中文词汇表。Pad 是占位符,单词用数字表示,如:

“Welcome to PyTorch Tutorials” 的数字编码:1 2 3 5
“Welcome to Machine Learning” 的数字编码:1 2 4 6
“欢迎来到派托奇教程” 的数字编码:3 4 5 6 7 8 9 10 11
“欢迎来到机器学习” 的数字编码:3 4 5 6 12 13 14 15

单词与编号是一一对应的。实现代码:

#define PadId 0

typedef const std::unordered_map<std::string, int64_t> TableVocab;
typedef std::vector<std::pair<int64_t, int64_t>> WordList;

/// <翻译>
/// Welcome to PyTorch Tutorials --- 欢迎来到派托奇教程
/// Welcome to Machine Learning ----- 欢迎来到机器学习
/// </翻译>
TableVocab src_vocab = {
    {"Pad", PadId},
    {"Welcome", 1},
    {"to", 2},
    {"PyTorch", 3},
    {"Machine", 4},
    {"Tutorials", 5},
    {"Learning", 6}
};

TableVocab tgt_vocab = {
    {"Pad", PadId},
    {"S", 1},
    {"E", 2},
    {"欢", 3},
    {"迎", 4},
    {"来", 5},
    {"到", 6},
    {"派", 7},
    {"托", 8},
    {"奇", 9},
    {"教", 10},
    {"程", 11},
    {"机", 12},
    {"器", 13},
    {"学", 14},
    {"习", 15}
};

int64_t src_vocab_size = src_vocab.size();
int64_t tgt_vocab_size = tgt_vocab.size();

std::vector<std::string> Split(const std::string& s)
{
    std::vector<std::string> res;
    std::stringstream ss(s);
    std::string word;
    while (ss >> word)
    {
        res.push_back(word);
    }
    return res;
}

std::string GetWordById(TableVocab vocabId, int64_t dataid)
{
    std::string Word = { 0 };
    for (auto w : vocabId)
    {
        if (w.second == dataid)
        {
            Word = w.first;
            break;
        }
    }
    return Word;
}

std::vector<int64_t> GetWordId(TableVocab vocabId, std::string data)
{
    std::vector<int64_t> input;
    for (auto ch : Split(data))
    {
        input.push_back(vocabId.at(ch));
    }
    return input;
}

WordList GetLoadDataWordId(std::pair<std::string, std::string> data)
{
    std::vector<int64_t> input = GetWordId(src_vocab, data.first);
    std::vector<int64_t> target = GetWordId(tgt_vocab, data.second);
    WordList item;
    for (int i = 0; i < input.size() && i < target.size(); i++)
    {
        item.push_back({ input.at(i), target.at(i) });
    }
    return item;
}

3. 数据集

自定义数据要注意以下几点:
1. 自定义派生类继承 torch::data::datasets::Dataset,实现生成自定义数据。
2. 为了简单,直接用空格进行分词;GetLoadDataWordId() 实现了从文本转换成数字编码。
3. 
用占位符填充长度。如 “Welcome to PyTorch Tutorials” 翻译成 “欢迎来到派托奇教程”:英文有 4 个单词,目标中文有 9 个单词,所以需要用占位符填充英文句子,填充后英文也是 9 个单词。
英文句子:Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad

数据集代码:

class translatDataset : public torch::data::Dataset<translatDataset>
{
public:
    translatDataset()
    {
        wordCount.push_back(GetLoadDataWordId({ "Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad", "欢 迎 来 到 派 托 奇 教 程" }));
        wordCount.push_back(GetLoadDataWordId({ "Welcome to Machine Learning Pad Pad Pad Pad", "欢 迎 来 到 机 器 学 习" }));
    }

    torch::optional<size_t> size() const
    {
        return wordCount.size();
    }

    torch::data::Example<torch::Tensor, torch::Tensor> get(size_t index) override
    {
        auto item = wordCount[index];
        std::vector<int64_t> tmpinput;
        std::vector<int64_t> tmptarget1;
        for each (auto i in item)
        {
            tmpinput.push_back(i.first);
            tmptarget1.push_back(i.second);
        }
        auto input = torch::tensor(tmpinput, torch::kLong);
        auto target = torch::tensor(tmptarget1, torch::kLong);
        return { input, target };
    }

public:
    std::vector<WordList> wordCount;
};

4. 位置编码

1. TORCH_MODULE(PositionalEncoding) 主要是为了将模型保存到文件。

class PositionalEncodingImpl : public torch::nn::Module
{
public:
    PositionalEncodingImpl(int64_t d_model, int64_t max_len)
    {
        _d_model = d_model;
        _max_len = max_len;
        _posEncode = torch::zeros({ _max_len, _d_model }, torch::kFloat32);
        Encoding();
        register_buffer("posEncode", _posEncode);
    }

    torch::Tensor forward(torch::Tensor x)
    {
        if ((x.dim() == 2))
        {
            x = x.unsqueeze_(-2);
        }
        auto dim = x.size(0);
        // std::cout << "pos " << _posEncode.slice(0, 0, dim).sizes() << std::endl;
        //std::cout << "x " << x.sizes() << std::endl;
        x = x + _posEncode.slice(0, 0, dim);
        return x;
    }

private:
    void Encoding()
    {
        auto pos = torch::arange(0, _max_len, torch::kFloat32).reshape({ _max_len, 1 });
        auto den_indices = torch::arange(0, _d_model, 2, torch::kFloat32);
        auto den = torch::exp(-den_indices * std::log(10000.0f) / _d_model);
        _posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(0, _d_model, 2) }, torch::sin(pos * den));
        _posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(1, _d_model, 2) }, torch::cos(pos * den));
        _posEncode.unsqueeze_(-2);
    }

public:
    torch::Tensor _posEncode;
    int64_t _d_model = dim_model;
    int64_t _max_len = max_vocab_len;
};
TORCH_MODULE(PositionalEncoding);

5. 构建模型

torch::nn::Transformer 没有实现词嵌入和位置编码,必须我们自己手动实现,还要补完周边设计(如图中红色方框)。实现代码:

class TranslatorImpl : public torch::nn::Module
{
public:
    TranslatorImpl()
    {
        src_emb = register_module("src_emb", torch::nn::Embedding(torch::nn::EmbeddingOptions(src_vocab_size, dim_model)));
        tgt_emb_ = register_module("tgt_emb", torch::nn::Embedding(torch::nn::EmbeddingOptions(tgt_vocab_size, dim_model)));
        pos_encoder = register_module("pos_encoder", PositionalEncoding(dim_model, max_vocab_len));

        torch::nn::TransformerOptions opts;
        opts.nhead(2);
        opts.dim_feedforward(dim_feed);
        opts.num_decoder_layers(1);
        opts.num_encoder_layers(1);
        opts.dropout(0.0);
        opts.d_model(dim_model);
        transformer = register_module("transformer", torch::nn::Transformer(opts));
        fc = register_module("fc", torch::nn::Linear(dim_model, tgt_vocab_size));
    }
};
TORCH_MODULE(Translator);

torch::nn::TransformerOptions 说明(为了减少计算量):
1. 只定义两个多头注意力。
2. 编/解码器各为一个。
3. Dropout 的功能是训练时防止过拟合,会丢掉一些数据,我们关闭它。
4. 前馈网络默认是 2048,定义为 256。
5. Transformer 的默认维度是 512,定义为 128。

6. 训练模型

1. Transformer::forward() 的定义及参数要求:

Tensor forward(
    const Tensor& src,  /// src: the sequence to the encoder (required). [seq, batch, dim]
    const Tensor& tgt,  /// tgt: the sequence to the decoder (required). [seq, batch, dim]
    const Tensor& src_mask = {},
    const Tensor& tgt_mask = {},
    const Tensor& memory_mask = {},
    const Tensor& src_key_padding_mask = {},
    const Tensor& tgt_key_padding_mask = {},
    const Tensor& memory_key_padding_mask = {});

src 和 tgt 是必须的,其他都是可选项(本着简单原则,这里不使用)。
src 和 tgt 的数据格式要求:张量形状为 [seq, batch, dim],其中 seq 是数据长度,batch 是批次大小,dim 是维度。
如 src:“Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad” 对应的张量形状为 [9, 1, 128]。
Transformer::forward() 返回值的张量形状为 [seq, batch, dim]。

2. 
我们用数据集加载的数据的张量形状是 [batch, seq],也就是原始数据需要换成 [seq, batch],然后做词嵌入和位置编码,最后送到 Transformer,再加全连接层。实现代码:

torch::Tensor forward(torch::Tensor src, torch::Tensor tgt)
{
    //[batch, seq] --> [seq, batch]
    src = src.permute({ 1,0 });
    tgt = tgt.permute({ 1,0 });
    //std::cout << "input " << src << std::endl;
    src = src_emb->forward(src) * std::sqrt(dim_model);
    src = pos_encoder->forward(src);
    tgt = tgt_emb_->forward(tgt) * std::sqrt(dim_model);
    tgt = pos_encoder->forward(tgt);
    // tgt src: (seq, batch, dim)
    auto outs = transformer->forward(src, tgt);
    // outs (seq, batch, dim)
    outs = fc->forward(outs);
    return outs;
}

3. 注意这部分:解码器的输入输出

对于解码器,要给出开始和结束标记:在中文词汇表中用 S 表示开始,E 表示结束。
“Welcome to Machine Learning Pad Pad Pad Pad” ---> “欢迎来到机器学习”(实际是数字编码,用文本更容易理解而已)

图解(注意红色标记)

构造开始和结束标记符的代码:

std::pair<torch::Tensor, torch::Tensor> CreateDecoderInputOutput(torch::Tensor data)
{
    auto E = torch::tensor(GetWordId(tgt_vocab, "E"), torch::kLong).view({ 1,1 });
    auto S = torch::tensor(GetWordId(tgt_vocab, "S"), torch::kLong).view({ 1,1 });
    auto input = torch::cat({ S, data }, 1);
    auto output = torch::cat({ data, E }, 1);
    return { input, output };
}

4. 
训练代码

void TrainData(Translator model)
{
    double accuracy = 0.03;
    auto datasetTrain = translatDataset().map(torch::data::transforms::Stack<>());
    auto train_data_loader = torch::data::make_data_loader(std::move(datasetTrain), torch::data::DataLoaderOptions().batch_size(1));

    torch::nn::CrossEntropyLoss loss_fn;
    torch::optim::Adam optimizer(model->parameters(), torch::optim::AdamOptions(1e-3));

    model->train();
    std::cout << "训练模型" << std::endl;
    for (int i = 0; i < max_train; i++)
    {
        float total_loss = 0;
        for (auto& item : *train_data_loader)
        {
            /// item: [batch, seq]
            auto [tgtInput, tgtOutput] = CreateDecoderInputOutput(item.target);
            auto tgtOut = model->forward(item.data, tgtInput);
            auto output = tgtOut.reshape({ -1, tgt_vocab_size });

            optimizer.zero_grad();
            auto tgt = tgtOutput.squeeze(0);
            auto loss = loss_fn(output, tgt);
            total_loss += loss.item<float>();

            torch::nn::utils::clip_grad_norm_(model->parameters(), 1.0);
            loss.backward();
            optimizer.step();
        }

        if (i % 10 == 0 || (i + 1 == max_train))
        {
            std::cout << "i: " << i + 1 << " , loss: " << total_loss << std::endl;
        }

        if (total_loss < accuracy)
        {
            std::cout << "break... , total_loss: " << total_loss << " , i: " << i + 1 << std::endl;
            break;
        }
    }
    std::cout << std::endl;
}

7. 
预测

翻译流程如上图:
1. 英文句子经过词嵌入和位置编码,送入 transformer::encoder 编码器,得到的结果输入到解码器中。
2. 目标(中文)句子加入开始标记符,经过词嵌入和位置编码送到解码器中,得到一个词;若不是结束符,就加入到目标(中文)句子,再次经过词嵌入和位置编码送到解码器中,得到下一个词,如此循环。解码器的输出要经过全连接层。

代码:

torch::Tensor predict(torch::Tensor src)
{
    auto srcemb = src_emb->forward(src) * std::sqrt(dim_model);
    srcemb = pos_encoder->forward(srcemb);
    auto memory = transformer->encoder.forward(srcemb);

    std::vector<int64_t> tgtpad = GetWordId(tgt_vocab, "S");
    int i = 0;
    while (i < tgt_vocab_size * 2)
    {
        torch::Tensor tgt = torch::tensor(tgtpad, torch::kLong);
        auto tgt_emb = tgt_emb_->forward(tgt) * std::sqrt(dim_model);
        tgt_emb = pos_encoder->forward(tgt_emb);
        auto out = transformer->decoder.forward(tgt_emb, memory);
        out = fc->forward(out).squeeze(-2);
        auto next_token = out.argmax(-1);
        int64_t key = next_token[i].item<int64_t>();
        tgtpad.push_back(key);
        //tgtpad.insert(tgtpad.begin(), );
        if ("E" == GetWordById(tgt_vocab, key))
        {
            break;
        }
        i++;
    }
    //tgtpad.pop_back();
    return torch::tensor(tgtpad, torch::kLong);
}

3. 翻译句子

void TestData(Translator model)
{
    model->eval();
    std::cout << "测试翻译:" << std::endl;

    std::vector<std::string> tests;
    tests.push_back("Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad");
    tests.push_back("Welcome to Machine Learning Pad Pad Pad Pad");

    for (auto ch : tests)
    {
        auto item = GetWordId(src_vocab, ch);
        auto src = torch::tensor(item, torch::kLong);
        auto result = model->predict(src);
        // std::cout << std::regex_replace(ch, std::regex("Pad"), "") << ": ";
        std::cout << ch << ": ";
        for (int k = 0; k < result.numel(); k++)
        {
            std::cout << GetWordById(tgt_vocab, result[k].item<int64_t>()) << " ";
        }
        std::cout << std::endl;
    }
}

运行结果

8. 问题

在预测阶段,我们是不知道输入与输出的对应关系的,无法使用占位符来补充输入。问题点在于训练时没有处理掩码占位符。下一章再细讲掩码。

完整代码

#include <torch/torch.h>
#include <iostream>
#include <torch/serialize.h>
#include <regex>
//#include <iostream>
#include <fstream>

#define dim_model 128
#define dim_feed 256
#define max_vocab_len 500
#define max_train 100
#define PadId 0

typedef const std::unordered_map<std::string, int64_t> TableVocab;
typedef std::vector<std::pair<int64_t, int64_t>> WordList;

/// <翻译>
/// Welcome to PyTorch Tutorials --- 欢迎来到派托奇教程
/// Welcome to
Machine Learning ----- 欢迎来到机器学习 /// /翻译 TableVocab src_vocab { {Pad,PadId}, {Welcome,1}, {to,2}, {PyTorch,3}, {Machine,4}, {Tutorials,5}, {Learning,6 } }; TableVocab tgt_vocab { {Pad,PadId}, {S,1}, {E,2}, {欢,3}, {迎,4}, {来,5}, {到,6}, {派,7}, {托,8}, {奇,9}, {教,10}, {程,11}, {机,12}, {器,13}, {学,14}, {习,15} }; int64_t src_vocab_size src_vocab.size(); int64_t tgt_vocab_size tgt_vocab.size(); std::vectorstd::string Split(const std::string s) { std::vectorstd::string res; std::stringstream ss(s); std::string word; while (ss word) { res.push_back(word); } return res; } std::string GetWordById(TableVocab vocabId, int64_t dataid) { std::string Word { 0 }; for (auto w : vocabId) { if (w.second dataid) { Word w.first; break; } } return Word; } std::vectorint64_t GetWordId(TableVocab vocabId, std::string data) { std::vectorint64_t input; for (auto ch : Split(data)) { input.push_back(vocabId.at(ch)); } return input; } WordList GetLoadDataWordId(std::pairstd::string, std::string data) { std::vectorint64_t input GetWordId(src_vocab, data.first); std::vectorint64_t target GetWordId(tgt_vocab, data.second); WordList item; for (int i 0; i input.size() i target.size(); i) { item.push_back({ input.at(i),target.at(i) }); } return item; } class translatDataset : public torch::data::DatasettranslatDataset { public: translatDataset() { wordCount.push_back(GetLoadDataWordId({ Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad,欢 迎 来 到 派 托 奇 教 程 })); wordCount.push_back(GetLoadDataWordId({ Welcome to Machine Learning Pad Pad Pad Pad,欢 迎 来 到 机 器 学 习 })); } torch::optionalsize_t size() const { return wordCount.size(); } torch::data::Exampletorch::Tensor, torch::Tensor get(size_t index) override { auto item wordCount[index]; std::vectorint64_t tmpinput; std::vectorint64_t tmptarget1; for each(auto i in item) { tmpinput.push_back(i.first); tmptarget1.push_back(i.second); } auto input torch::tensor(tmpinput, torch::kLong); auto target torch::tensor(tmptarget1, torch::kLong); return { input, target }; } 
public: std::vectorWordList wordCount; }; std::pairtorch::Tensor, torch::Tensor CreateDecoderInputOutput(torch::Tensor data) { auto E torch::tensor(GetWordId(tgt_vocab, E), torch::kLong).view({ 1,1 }); auto S torch::tensor(GetWordId(tgt_vocab, S), torch::kLong).view({ 1,1 }); auto input torch::cat({ S, data }, 1); auto output torch::cat({ data,E }, 1); return { input ,output }; } class PositionalEncodingImpl :public torch::nn::Module { public: PositionalEncodingImpl(int64_t d_model, int64_t max_len) { _d_model d_model; _max_len max_len; _posEncode torch::zeros({ _max_len, _d_model }, torch::kFloat32); Encoding(); register_buffer(posEncode, _posEncode); } torch::Tensor forward(torch::Tensor x) { if ((x.dim() 2)) { x x.unsqueeze_(-2); } auto dim x.size(0); // std::cout pos _posEncode.slice(0, 0, dim).sizes() std::endl; //std::cout x x.sizes() std::endl; x x _posEncode.slice(0, 0, dim); return x; } private: void Encoding() { auto pos torch::arange(0, _max_len, torch::kFloat32).reshape({ _max_len, 1 }); auto den_indices torch::arange(0, _d_model, 2, torch::kFloat32); auto den torch::exp(-den_indices * std::log(10000.0f) / _d_model); _posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(0, _d_model, 2) }, torch::sin(pos * den)); _posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(1, _d_model, 2) }, torch::cos(pos * den)); _posEncode.unsqueeze_(-2); } public: torch::Tensor _posEncode; int64_t _d_model dim_model; int64_t _max_len max_vocab_len; }; TORCH_MODULE(PositionalEncoding); class TranslatorImpl : public torch::nn::Module { public: TranslatorImpl() { src_emb register_module(src_emb, torch::nn::Embedding(torch::nn::EmbeddingOptions(src_vocab_size, dim_model))); tgt_emb_ register_module(tgt_emb, torch::nn::Embedding(torch::nn::EmbeddingOptions(tgt_vocab_size, dim_model))); pos_encoder register_module(pos_encoder, PositionalEncoding(dim_model, max_vocab_len)); torch::nn::TransformerOptions opts; opts.nhead(2); 
opts.dim_feedforward(dim_feed); opts.num_decoder_layers(1); opts.num_encoder_layers(1); opts.dropout(0.0); opts.d_model(dim_model); transformer register_module(transformer, torch::nn::Transformer(opts)); fc register_module(fc, torch::nn::Linear(dim_model, tgt_vocab_size)); } torch::Tensor forward(torch::Tensor src, torch::Tensor tgt) { //[batch, seq] -- [seq, batch] src src.permute({ 1,0 }); tgt tgt.permute({ 1,0 }); //std::cout input src std::endl; src src_emb-forward(src) * std::sqrt(dim_model); src pos_encoder-forward(src); tgt tgt_emb_-forward(tgt) * std::sqrt(dim_model); tgt pos_encoder-forward(tgt); // tgt src: (seq, batch, dim) auto outs transformer-forward(src, tgt); outs fc-forward(outs); return outs; } torch::Tensor predict(torch::Tensor src) { auto srcemb src_emb-forward(src) * std::sqrt(dim_model); srcemb pos_encoder-forward(srcemb); auto memory transformer-encoder.forward(srcemb); std::vectorint64_t tgtpad GetWordId(tgt_vocab, S); int i 0; while (i tgt_vocab_size * 2) { torch::Tensor tgt torch::tensor(tgtpad, torch::kLong); auto tgt_emb tgt_emb_-forward(tgt) * std::sqrt(dim_model); tgt_emb pos_encoder-forward(tgt_emb); auto out transformer-decoder.forward(tgt_emb, memory); out fc-forward(out).squeeze(-2); auto next_token out.argmax(-1); int64_t key next_token[i].itemint64_t(); tgtpad.push_back(key); //tgtpad.insert(tgtpad.begin(), ); if (E GetWordById(tgt_vocab, key)) { break; } i; } //tgtpad.pop_back(); return torch::tensor(tgtpad, torch::kLong); } torch::nn::Embedding src_emb{ nullptr }; torch::nn::Embedding tgt_emb_{ nullptr }; PositionalEncoding pos_encoder{ nullptr }; torch::nn::Transformer transformer{ nullptr }; torch::nn::Linear fc{ nullptr }; }; TORCH_MODULE(Translator); void TestData(Translator model); void TrainData(Translator model); void TransformerMain() { torch::manual_seed(4); std::string model_path translator_model.pt; Translator model; std::ifstream filem(model_path); bool bmodel filem.is_open(); if (!bmodel||1) { TrainData(model); 
torch::save(model, model_path); } else { torch::load(model, model_path); std::cout load model .... std::endl; } filem.close(); TestData(model); } void TrainData(Translator model) { double accuracy 0.03; auto datasetTrain translatDataset().map(torch::data::transforms::Stack()); auto train_data_loader torch::data::make_data_loader(std::move(datasetTrain), torch::data::DataLoaderOptions().batch_size(1)); torch::nn::CrossEntropyLoss loss_fn; torch::optim::Adam optimizer(model-parameters(), torch::optim::AdamOptions(1e-3)); model-train(); std::cout 训练模型 std::endl; for (int i 0; i max_train; i) { float total_loss 0; for (auto item : *train_data_loader) { /// item: [batch, seq] auto [tgtInput, tgtOutput] CreateDecoderInputOutput(item.target); auto tgtOut model-forward(item.data, tgtInput); auto output tgtOut.reshape({ -1, tgt_vocab_size }); optimizer.zero_grad(); auto tgt tgtOutput.squeeze(0); auto loss loss_fn(output, tgt); total_loss loss.itemfloat(); torch::nn::utils::clip_grad_norm_(model-parameters(), 1.0); loss.backward(); optimizer.step(); } if (i % 10 0 || (i 1 max_train)) { std::cout i: i 1 , loss: total_loss std::endl; } if (total_loss accuracy) { std::cout break... 
, total_loss: total_loss , i: i 1 std::endl; break; } } std::cout std::endl; } void TestData(Translator model) { model-eval(); std::cout 测试翻译: std::endl; std::vectorstd::string tests; //tests.push_back(Welcome); //tests.push_back(Welcome to); //tests.push_back(Welcome to PyTorch); //tests.push_back(Welcome to Machine); //tests.push_back(Welcome to PyTorch Tutorials); //tests.push_back(Welcome to Machine Learning); tests.push_back(Welcome to PyTorch Tutorials Pad Pad Pad Pad Pad); tests.push_back(Welcome to Machine Learning Pad Pad Pad Pad); for (auto ch : tests) { auto item GetWordId(src_vocab, ch); auto src torch::tensor(item, torch::kLong); auto result model-predict(src); // std::cout std::regex_replace(ch, std::regex(Pad), ) : ; std::cout ch : ; for (int k 0; k result.numel(); k) { std::cout GetWordById(tgt_vocab, result[k].itemint64_t()) ; } std::cout std::endl; } }感谢大家的支持如要问题欢迎提问指正。