# QRotaryTraining - A novel method for fully training all parameters of large
# language models (LLMs) while using less device memory than traditional methods.
# Copyright (C) 2024 Carl Philipp Klemm
#
# This file is part of QRotaryTraining.
#
# QRotaryTraining is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# QRotaryTraining is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with QRotaryTraining. If not, see <https://www.gnu.org/licenses/>.
import transformers
from arguments import ModelArguments
DEFAULT_PAD_TOKEN = "[PAD]"


def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg
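
# Example usage (a minimal sketch, kept in comments so importing this module has no
# side effects; the "huggyllama/llama-7b" checkpoint is only an illustrative
# assumption, not something this project requires):
#
#   model = transformers.AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")
#   tokenizer = transformers.AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False)
#   smart_tokenizer_and_embedding_resize(
#       special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
#       tokenizer=tokenizer,
#       model=model,
#   )
#   # The embedding matrix now has one row per token in the resized vocabulary,
#   # with the new rows initialized to the mean of the pre-existing embeddings.
#   assert model.get_input_embeddings().weight.shape[0] == len(tokenizer)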


def get_tokenizer(model, cache_dir, model_args: ModelArguments):
    """Load the tokenizer matching `model` and ensure it has a usable pad token."""
    tokenizer_path = model_args.tokenizer if model_args.tokenizer is not None else model_args.model_name_or_path
    print(f'Tokenizer: {tokenizer_path}')
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        cache_dir=cache_dir,
        padding_side="right",
        use_fast=False,
        tokenizer_type='llama' if 'llama' in model_args.model_name_or_path else None,
        trust_remote_code=model_args.trust_remote_code
    )

    if tokenizer._pad_token is None and not model_args.noresize:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    if 'llama' in model_args.model_name_or_path or isinstance(tokenizer, transformers.LlamaTokenizer):
        # The LLaMA tokenizer may not have the correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these are present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0, which corresponds to the `<unk>` token.
        print('Adding special tokens.')
        tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
        })

    return tokenizer
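
# Example usage (a minimal sketch in comments; the ModelArguments fields and the
# "huggyllama/llama-7b" checkpoint are illustrative assumptions and may not match
# the defaults defined in arguments.py):
#
#   model_args = ModelArguments(model_name_or_path="huggyllama/llama-7b")
#   model = transformers.AutoModelForCausalLM.from_pretrained(
#       model_args.model_name_or_path, cache_dir=None)
#   tokenizer = get_tokenizer(model, cache_dir=None, model_args=model_args)
#   batch = tokenizer(["Hello world"], padding=True, return_tensors="pt")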