import copy
import itertools
import typing
from dataclasses import dataclass

import datasets
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from arguments import DataArguments

IGNORE_INDEX = -100

def group_texts(examples, block_size: int):
    """Concatenate tokenized texts and split them into fixed-size blocks."""
    # Concatenate all texts.
    concatenated_examples = {k: list(itertools.chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could pad instead if the model supported it.
    # Customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
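
# Illustrative sketch (not part of the pipeline): with block_size=4, group_texts
# turns a batch of tokenized examples into fixed-size blocks, e.g.
#
#   batch = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8]],
#            "attention_mask": [[1, 1, 1], [1, 1, 1, 1, 1]]}
#   group_texts(batch, block_size=4)
#   # -> {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]],
#   #     "attention_mask": [[1, 1, 1, 1], [1, 1, 1, 1]],
#   #     "labels": [[1, 2, 3, 4], [5, 6, 7, 8]]}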

@dataclass
class DataCollatorForCausalLMText(object):
    """Collate plain-text examples into padded causal-LM batches."""

    tokenizer: transformers.PreTrainedTokenizer
    max_len: int

    def __call__(self, instances: typing.Sequence[typing.Dict]) -> typing.Dict[str, torch.Tensor]:
        # Extract elements and wrap each text with BOS/EOS tokens.
        examples = [f"{self.tokenizer.bos_token}{example['text']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_examples = self.tokenizer(
            examples,
            max_length=self.max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        for tokenized_example in tokenized_examples['input_ids']:
            input_ids.append(torch.tensor(tokenized_example))
        # Apply padding
        if self.tokenizer.pad_token_id is not None:
            padding_value = self.tokenizer.pad_token_id
        elif self.tokenizer.eos_token_id is not None:
            padding_value = self.tokenizer.eos_token_id
        else:
            raise RuntimeError("Model does not have a pad or eos token")
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=padding_value)

        data_dict = {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(padding_value),
            'labels': input_ids
        }
        return data_dict
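
# Usage sketch (illustrative only; assumes a tokenizer with BOS/EOS tokens, e.g. GPT-2):
#
#   tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
#   collator = DataCollatorForCausalLMText(tokenizer=tokenizer, max_len=512)
#   batch = collator([{"text": "first example"}, {"text": "a longer second example"}])
#   # batch["input_ids"] / batch["labels"]: (2, seq_len) tensors, padded with the pad/eos id
#   # batch["attention_mask"]: input_ids.ne(padding_value), i.e. False wherever the padding id appears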

@dataclass
class DataCollatorForCausalLMs2s(object):
    """Collate source/target pairs into padded causal-LM batches."""

    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: typing.Sequence[typing.Dict]) -> typing.Dict[str, torch.Tensor]:
        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
        targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_targets = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for tokenized_source, tokenized_target in zip(
            tokenized_sources_with_prompt['input_ids'],
            tokenized_targets['input_ids']
        ):
            if not self.predict_with_generate:
                input_ids.append(torch.tensor(tokenized_source + tokenized_target))
                if not self.train_on_source:
                    # Mask the source tokens so the loss is computed on the target only.
                    labels.append(
                        torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
                    )
                else:
                    labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
            else:
                input_ids.append(torch.tensor(tokenized_source))
        # Apply padding
        if self.tokenizer.pad_token_id is not None:
            padding_value = self.tokenizer.pad_token_id
        elif self.tokenizer.eos_token_id is not None:
            padding_value = self.tokenizer.eos_token_id
        else:
            raise RuntimeError("Model does not have a pad or eos token")
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=padding_value)
        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
        data_dict = {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(padding_value),
        }
        if labels is not None:
            data_dict['labels'] = labels
        return data_dict
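
# Labeling sketch (illustrative): with train_on_source=False and
# predict_with_generate=False, a pair such as
#   source tokens  [s1, s2, s3]    target tokens  [t1, t2]
# becomes
#   input_ids -> [s1, s2, s3, t1, t2]
#   labels    -> [-100, -100, -100, t1, t2]
# so the loss is only computed on the target (IGNORE_INDEX positions are skipped).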

def create_data_module_s2s(tokenizer: transformers.PreTrainedTokenizer, data_args: DataArguments, do_train, do_eval, do_predict) -> typing.Dict:
    try:
        dataset = datasets.Dataset.from_json(path_or_paths=data_args.dataset)
    except FileNotFoundError as ex:
        raise ValueError(f"Error loading dataset from {data_args.dataset}, {ex}")

    if do_eval or do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print('Splitting train dataset into train and validation according to `eval_dataset_size`')
            dataset = dataset.train_test_split(
                test_size=data_args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    if 'train' in dataset:
        train_dataset = dataset['train']
    else:
        train_dataset = dataset

    train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    data_collator = DataCollatorForCausalLMs2s(
        tokenizer=tokenizer,
        source_max_len=data_args.source_max_len,
        target_max_len=data_args.target_max_len,
        train_on_source=data_args.train_on_source,
        predict_with_generate=False  # args.predict_with_generate,
    )

    return dict(
        train_dataset=train_dataset if do_train else None,
        eval_dataset=eval_dataset if do_eval else None,
        predict_dataset=eval_dataset if do_predict else None,
        data_collator=data_collator
    )
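
# Usage sketch (illustrative; assumes `model`, `training_args`, and a populated
# DataArguments instance are available):
#
#   data_module = create_data_module_s2s(tokenizer, data_args, do_train=True, do_eval=True, do_predict=False)
#   trainer = transformers.Trainer(
#       model=model,
#       args=training_args,
#       train_dataset=data_module["train_dataset"],
#       eval_dataset=data_module["eval_dataset"],
#       data_collator=data_module["data_collator"],
#   )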

def create_data_module_hub(tokenizer: transformers.PreTrainedTokenizer, data_args: DataArguments, do_train, do_eval, do_predict) -> typing.Dict:
    try:
        dataset = datasets.load_dataset(data_args.dataset)
    except FileNotFoundError as ex:
        raise ValueError(f"Error loading dataset from {data_args.dataset}, {ex}")

    if do_eval or do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print('Splitting train dataset into train and validation according to `eval_dataset_size`')
            dataset = dataset['train'].train_test_split(
                test_size=data_args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']

    if 'train' in dataset:
        train_dataset = dataset['train']
    else:
        train_dataset = dataset

    data_collator = DataCollatorForCausalLMText(
        tokenizer=tokenizer,
        max_len=data_args.source_max_len,
    )

    return dict(
        train_dataset=train_dataset if do_train else None,
        eval_dataset=eval_dataset if do_eval else None,
        predict_dataset=eval_dataset if do_predict else None,
        data_collator=data_collator
    )

def create_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args: DataArguments, do_train: bool, do_eval: bool, do_predict: bool) -> typing.Dict:
    try:
        dataset = datasets.load_dataset('text', data_files={'train': [data_args.dataset]})
    except FileNotFoundError as ex:
        raise ValueError(f"Error loading dataset from {data_args.dataset}, {ex}")

    if data_args.block_size > tokenizer.model_max_length:
        raise ValueError(f"Block size of {data_args.block_size} is larger than the maximum size supported by the model: {tokenizer.model_max_length}")

    def add_newline_fn(example):
        # Restore the newline stripped by the 'text' loader so lines stay separated after grouping.
        example['text'] = example['text'] + '\n'
        return example
    dataset = dataset.map(add_newline_fn)

    eval_dataset = None
    if do_eval or do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print('Splitting train dataset into train and validation according to `eval_dataset_size`')
            dataset = dataset['train'].train_test_split(
                test_size=data_args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']

    if 'train' in dataset:
        train_dataset = dataset['train']
    else:
        train_dataset = dataset

    train_dataset_tokenized = train_dataset.map(
        lambda example: tokenizer(example['text']),
        batched=True,
        remove_columns='text',
        num_proc=32,
        load_from_cache_file=True)
    train_dataset_tokenized = train_dataset_tokenized.map(
        lambda example: group_texts(example, data_args.block_size),
        batched=True,
        num_proc=32,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {data_args.block_size}")

    eval_dataset_tokenized = None
    if eval_dataset is not None:
        eval_dataset_tokenized = eval_dataset.map(
            lambda example: tokenizer(example['text']),
            batched=True,
            remove_columns='text',
            num_proc=32)
        eval_dataset_tokenized = eval_dataset_tokenized.map(
            lambda example: group_texts(example, data_args.block_size),
            batched=True,
            num_proc=32,
            load_from_cache_file=True,
            desc=f"Grouping texts in chunks of {data_args.block_size}")

    return dict(
        train_dataset=train_dataset_tokenized if do_train else None,
        eval_dataset=eval_dataset_tokenized if do_eval else None,
        predict_dataset=eval_dataset_tokenized if do_predict else None,
        data_collator=transformers.default_data_collator
    )
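
# Usage sketch (illustrative; assumes `data_args.dataset` points at a plain UTF-8
# text file, which the 'text' loader splits into one example per line):
#
#   data_module = create_data_module(tokenizer, data_args, do_train=True, do_eval=False, do_predict=False)
#   # data_module["train_dataset"] holds block_size-long chunks of token ids ready
#   # for causal-LM training with transformers.default_data_collator.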