# QRotaryTraining - A novel method for fully training all parameters of large
# language models (LLMs) while using less device memory than traditional methods.
# Copyright (C) 2024 Carl Philipp Klemm
#
# This file is part of QRotaryTraining.
#
# QRotaryTraining is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# QRotaryTraining is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with QRotaryTraining.  If not, see <http://www.gnu.org/licenses/>.

import transformers

from arguments import ModelArguments


DEFAULT_PAD_TOKEN = "[PAD]"


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        # Initialize each new embedding row with the mean of the existing rows rather
        # than leaving the default random initialization, so that the new tokens start
        # out close to the model's learned distribution.
        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg


def get_tokenizer(model, cache_dir, model_args: ModelArguments):
    # Prefer an explicitly requested tokenizer; otherwise fall back to the model's own.
    tokenizer_path = model_args.tokenizer if model_args.tokenizer is not None else model_args.model_name_or_path
    print(f'Tokenizer: {tokenizer_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        cache_dir=cache_dir,
        padding_side="right",
        use_fast=False,
        # Select the llama tokenizer class explicitly when the model name indicates LLaMA.
        tokenizer_type='llama' if 'llama' in model_args.model_name_or_path else None,
        trust_remote_code=model_args.trust_remote_code
    )
    if tokenizer._pad_token is None and not model_args.noresize:
        # No pad token defined: add one and resize the model's embeddings to match.
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    if 'llama' in model_args.model_name_or_path or isinstance(tokenizer, transformers.LlamaTokenizer):
        # The LLaMA tokenizer may not have the correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these tokens are already present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0, which corresponds to the `<unk>` token.
        print('Adding special tokens.')
        tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
        })
    return tokenizer
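

# Example usage (a minimal sketch, not part of the original module). Assumes the model
# has already been loaded, e.g. via transformers.AutoModelForCausalLM, and that
# ModelArguments (from arguments.py) exposes the fields referenced above:
# `tokenizer`, `model_name_or_path`, `trust_remote_code` and `noresize`. The model
# name and the ModelArguments constructor call are illustrative assumptions only.
#
# model = transformers.AutoModelForCausalLM.from_pretrained('huggyllama/llama-7b')
# model_args = ModelArguments(
#     model_name_or_path='huggyllama/llama-7b',
#     tokenizer=None,
#     trust_remote_code=False,
#     noresize=False,
# )
# tokenizer = get_tokenizer(model, cache_dir=None, model_args=model_args)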
 |