# QRotaryTraining - A novel method for fully training all parameters of large
# language models (llms) while using less device memory than traditional methods.
# Copyright (C) 2024 Carl Philipp Klemm
#
# This file is part of QRotaryTraining.
#
# QRotaryTraining is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# QRotaryTraining is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with QRotaryTraining. If not, see <https://www.gnu.org/licenses/>.

import transformers

from arguments import ModelArguments

DEFAULT_PAD_TOKEN = "[PAD]"


def smart_tokenizer_and_embedding_resize(
        special_tokens_dict: dict,
        tokenizer: transformers.PreTrainedTokenizer,
        model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        # Initialize the embeddings of the newly added tokens to the mean of the
        # existing embeddings so they start from a reasonable point in embedding space.
        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg


def get_tokenizer(model, cache_dir, model_args: ModelArguments):
    """Load the tokenizer belonging to `model` and make sure its special tokens are set up."""
    tokenizer_path = model_args.tokenizer if model_args.tokenizer is not None else model_args.model_name_or_path
    print(f'Tokenizer: {tokenizer_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        cache_dir=cache_dir,
        padding_side="right",
        use_fast=False,
        tokenizer_type='llama' if 'llama' in model_args.model_name_or_path else None,
        trust_remote_code=model_args.trust_remote_code
    )

    # If the tokenizer has no pad token, add one and resize the model's embeddings accordingly.
    if tokenizer._pad_token is None and not model_args.noresize:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )

    if 'llama' in model_args.model_name_or_path or isinstance(tokenizer, transformers.LlamaTokenizer):
        # LLaMA tokenizer may not have correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these are present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0, which corresponds to the `<unk>` token.
        print('Adding special tokens.')
        tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
        })

    return tokenizer
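

# Example usage (a minimal, hypothetical sketch, kept as comments so it does not
# affect this module at import time): it assumes a `ModelArguments` instance
# populated elsewhere by the project's argument parsing, carrying the fields this
# file already relies on (`tokenizer`, `model_name_or_path`, `noresize`,
# `trust_remote_code`). The cache directory value below is likewise an assumed
# placeholder, not something defined by this project.
#
#   model = transformers.AutoModelForCausalLM.from_pretrained(
#       model_args.model_name_or_path,
#       cache_dir="~/.cache/huggingface",
#       trust_remote_code=model_args.trust_remote_code,
#   )
#   tokenizer = get_tokenizer(model, "~/.cache/huggingface", model_args)
#   print(tokenizer.pad_token, tokenizer.eos_token, tokenizer.bos_token)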