import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["WANDB_DISABLED"] = "true"

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import pickle

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score, average_precision_score, balanced_accuracy_score
from transformers import BartForConditionalGeneration, BartConfig, Trainer, TrainingArguments, \
    AutoModelForSequenceClassification, T5ForConditionalGeneration, BartTokenizer, AutoTokenizer, \
    BartForSequenceClassification, PLBartForSequenceClassification, RobertaForMaskedLM, RobertaConfig
from transformers import PreTrainedTokenizerFast
import numpy as np
import random
import evaluate
import gc
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import logging
from torch import nn


# only one GPU is exposed via CUDA_VISIBLE_DEVICES, so use device 0
torch.cuda.set_device(0)
print(device)


## free GPU cache
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()
    gc.collect()
    torch.cuda.empty_cache()
    # numba's cuda.select_device()/cuda.close() would destroy the CUDA context
    # that PyTorch is using, so only garbage collection and empty_cache() run here
    print("GPU Usage after emptying the cache")
    gpu_usage()


free_gpu_cache()


## set seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(123)


## set tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
tokenizer.add_special_tokens({'bos_token': '<s>'})
tokenizer.add_special_tokens({'eos_token': '</s>'})
tokenizer.add_special_tokens({'unk_token': '<unk>'})
tokenizer.add_special_tokens({'mask_token': '<mask>'})
tokenizer.add_special_tokens({'pad_token': '<pad>'})
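
# The special-token strings above are assumed to already exist in tokenizer.json
# with the same IDs used during pretraining; printing them makes this easy to verify.
print(tokenizer.special_tokens_map)
print(tokenizer.pad_token_id, tokenizer.mask_token_id)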


# ## set model config
# config = RobertaConfig(
#     attention_probs_dropout_prob=0.1,
#     bos_token_id=0,
#     eos_token_id=2,
#     hidden_act="gelu",
#     hidden_dropout_prob=0.1,
#     hidden_size=768,
#     initializer_range=0.02,
#     intermediate_size=3072,
#     layer_norm_eps=1e-12,
#     max_position_embeddings=514,
#     model_type="roberta",
#     num_attention_heads=12,
#     num_hidden_layers=12,
#     pad_token_id=1,
#     type_vocab_size=1,
#     vocab_size=5000,
# )
# model = RobertaForMaskedLM(config)

model = RobertaForMaskedLM.from_pretrained("pretraining/checkpoint")
model.to(device)
print(model.num_parameters())
print(model.config)
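
# If the special tokens added above were not already part of the pretraining
# vocabulary, the embedding matrix would have to be resized to match the
# tokenizer; here they are assumed to be present, so this stays optional.
# model.resize_token_embeddings(len(tokenizer))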


# load training and validation dataset
with open('train_data_tc_task.pkl', 'rb') as f:
    train_dataset = pickle.load(f)
with open('valid_data_tc_task.pkl', 'rb') as f:
    valid_dataset = pickle.load(f)

final_train_data = []
final_valid_data = []
# print(train_dataset[0]['input_ids'])
for i in range(len(train_dataset)):
    final_train_data.append({'input_ids': torch.tensor(train_dataset[i]['input_ids'][0]),
                             'labels': torch.tensor(train_dataset[i]['labels'][0])})
for i in range(len(valid_dataset)):
    final_valid_data.append({'input_ids': torch.tensor(valid_dataset[i]['input_ids'][0]),
                             'labels': torch.tensor(valid_dataset[i]['labels'][0])})

print(len(final_train_data[0]['input_ids']))
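
# Assumption: each pickled example stores a pre-tokenized, fixed-length sequence
# wrapped in an extra outer (batch) dimension, which is why the loops above take
# element [0] of 'input_ids' and 'labels'.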


def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits
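
# This helper is not wired into the Trainer below; if evaluation were enabled,
# it could be passed via the Trainer's preprocess_logits_for_metrics argument.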


training_args = TrainingArguments(
    output_dir="/model/",
    evaluation_strategy="no",
    logging_strategy="steps",
    save_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    seed=123,
    save_steps=10000,
    logging_steps=1000,
    max_steps=100000,
    optim="adamw_torch",
    fp16=True,
    report_to="none"
)
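
# With evaluation_strategy="no", the validation pickle is only consumed by the
# manual masked-token evaluation loop at the end of this script.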


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_train_data,
)
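
# No data_collator is given, so the default collator simply stacks the tensors;
# this assumes every example is already padded to the same fixed length.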

trainer.train()
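
# Optionally, an interrupted run can be resumed from the newest checkpoint in
# output_dir with trainer.train(resume_from_checkpoint=True).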


## validation
def validation(i, path):
    # print("checkpoint ", i)
    model = RobertaForMaskedLM.from_pretrained('pretraining/checkpoint')
    model.to(device)
    # print(model.num_parameters())

    results = []
    no = 0
    for data in final_valid_data:
        no = no + 1
        print(no)
        # add a batch dimension, since each stored example is a single sequence
        inputs = {'input_ids': data['input_ids'].unsqueeze(0).to(device)}
        labels = data['labels'].to(device)
        with torch.no_grad():
            output = model(**inputs).logits
        mask_token_index = (inputs['input_ids'] == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

        # print(output)
        # rank the full vocabulary for every masked position
        a, idx1 = torch.sort(output[0, mask_token_index], descending=True)
        # print(a)
        # print(list(np.round(torch.softmax(a, dim=1).cpu().numpy(), 5)))
        a = torch.softmax(a, dim=1).cpu().numpy()
        predicted_token_id_5 = idx1[:, :5]
        predicted_token_ratio_5 = a[:, :5]
        ratios = []
        for items in predicted_token_ratio_5:
            temp = []
            for item in items:
                temp.append(round(float(item), 5))
            ratios.append(temp)
        # print(ratios)

        results.append({'id': no,
                        'input': data['input_ids'].cpu().numpy(),
                        'labels': labels[mask_token_index].cpu().numpy(),
                        'predictions': predicted_token_id_5.cpu().numpy(),
                        'ratios': ratios})
    # print(results)
    with open('results.pkl', 'wb') as f:
        pickle.dump(results, f)
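

# A minimal sketch for consuming results.pkl: top-k accuracy over the masked
# positions, assuming each row of 'labels' lines up with the matching row of
# 'predictions' (this hypothetical helper is not part of the training flow).
def top_k_accuracy(results, k=5):
    hits, total = 0, 0
    for entry in results:
        for label, preds in zip(entry['labels'], entry['predictions']):
            total += 1
            if label in preds[:k]:
                hits += 1
    return hits / total if total else 0.0

# example use (after validation has produced results.pkl):
# with open('results.pkl', 'rb') as f:
#     print("top-5 accuracy:", top_k_accuracy(pickle.load(f)))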


print("Roberta:")
validation(20, "pretraining")