How To

Hugging Face Transformers

Implementing transformer models for natural language processing

pip install transformers
pip install sentencepiece
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
# TensorFlow variant of the model; the config and tokenizer classes are
# framework-agnostic and shared with PyTorch.
from transformers import TFT5ForConditionalGeneration
from google.colab import drive
drive.mount('/content/drive')
model_dir = '/content/drive/T5model'

# Download the pretrained checkpoint and cache a copy on Drive. Its config
# is read from the checkpoint automatically; passing a freshly constructed
# default T5Config() here would override it and can break weight loading.
model = T5ForConditionalGeneration.from_pretrained('google/t5-small-ssm')
model.save_pretrained(model_dir)
# Later sessions can reload the cached copy instead of re-downloading.
model = T5ForConditionalGeneration.from_pretrained(model_dir)
# To train from scratch instead, build a custom config (250112 matches mT5's
# vocabulary size) and pass it to T5ForConditionalGeneration(config) to get
# an untrained model.
config = T5Config(vocab_size=250112, num_layers=8, num_heads=6)
try:
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
except (OSError, ValueError):
    # Fall back to the hosted tokenizer and cache it on Drive.
    tokenizer = T5Tokenizer.from_pretrained('google/t5-small-ssm')
    tokenizer.save_pretrained(model_dir)
# Register language codes as special tokens so they are never split.
tokens = ['en', 'fr', 'zh']
tokenizer.add_special_tokens({'additional_special_tokens': tokens})
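One step that is easy to miss: after adding tokens, the model's embedding matrix no longer covers the enlarged vocabulary, so resize it before training. A one-line fix, assuming the model loaded above is in scope:

# Grow the input (and tied output) embeddings to the new vocabulary size.
model.resize_token_embeddings(len(tokenizer))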
ids = tokenizer.encode(text)               # str -> [int, int, ...]
text = tokenizer.decode(ids)               # [int, int, ...] -> str
batch_ids = tokenizer(texts).input_ids     # [str, str, ...] -> [[int, ...], ...]
texts = tokenizer.batch_decode(batch_ids)  # [[int, ...], ...] -> [str, str, ...]
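For a quick sanity check, a hypothetical round trip (the exact ids depend on the tokenizer's vocabulary):

ids = tokenizer.encode('hello world')
print(ids)  # a short list of ints ending in 1, the </s> id T5 appends
print(tokenizer.decode(ids, skip_special_tokens=True))  # 'hello world'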
source = [str, str, ...]  # source texts
target = [str, str, ...]  # target texts
# prepare_seq2seq_batch tokenizes and pads both sides at once (deprecated
# in newer transformers; tokenizer(source, text_target=target) is the
# modern equivalent).
encodings = tokenizer.prepare_seq2seq_batch(source, target, return_tensors='pt')
# Prepend the pad token to each target so the labels can be shifted right
# for teacher forcing below.
target = [tokenizer.pad_token + ' ' + x for x in target]
encodings = tokenizer.prepare_seq2seq_batch(source, target, return_tensors='pt')
encoder_input_ids = encodings.input_ids
encoder_attention_mask = encodings.attention_mask
# Teacher forcing: the decoder input is the target shifted right by one,
# which the prepended pad token above makes possible.
decoder_input_ids = encodings.labels[:, :-1].clone()  # drop last token
labels = encodings.labels[:, 1:].clone()              # drop first (pad) token
# Replace padding with -100 so it is ignored by the loss.
labels[labels == tokenizer.pad_token_id] = -100
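To see what the shift produces, a tiny hypothetical example (the ids are made up, except that 0 is T5's pad id and 1 its </s> id):

import torch

row = torch.tensor([[0, 100, 200, 300, 1]])  # [<pad>, tok, tok, tok, </s>]
decoder_input_ids = row[:, :-1]  # tensor([[  0, 100, 200, 300]])
labels = row[:, 1:]              # tensor([[100, 200, 300,   1]])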
outputs = model(
    input_ids=encoder_input_ids,
    attention_mask=encoder_attention_mask,
    decoder_input_ids=decoder_input_ids,
    labels=labels,
)
loss = outputs[0]  # the cross-entropy loss (also available as outputs.loss)
import torch
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
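The loop below reads from a dataloader that is never defined in the snippet. Here is a minimal sketch of one way to build it from the tensors prepared above; the TranslationDataset class and its field names are assumptions, not part of the article:

from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):  # hypothetical helper
    def __init__(self, enc_ids, enc_mask, dec_ids, labels):
        self.enc_ids, self.enc_mask = enc_ids, enc_mask
        self.dec_ids, self.labels = dec_ids, labels

    def __len__(self):
        return self.enc_ids.size(0)

    def __getitem__(self, i):
        return {
            'encoder_input_ids': self.enc_ids[i],
            'encoder_attention_mask': self.enc_mask[i],
            'decoder_input_ids': self.dec_ids[i],
            'labels': self.labels[i],
        }

dataset = TranslationDataset(encoder_input_ids, encoder_attention_mask,
                             decoder_input_ids, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)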
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        # Move every tensor in the batch to the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch['encoder_input_ids'],
            attention_mask=batch['encoder_attention_mask'],
            decoder_input_ids=batch['decoder_input_ids'],
            labels=batch['labels'],
        )
        loss = outputs[0]
        loss.backward()
        optimizer.step()
model.eval()
with torch.no_grad():
    output = model(
        ...
    )
model.save_pretrained(new_model_dir)
predicted_tokens = model.generate(
    encoder_input_ids,
    decoder_start_token_id=tokenizer.pad_token_id,
    num_beams=5,
    early_stopping=True,
    max_length=MAX_LEN,
)
predicted_strings = tokenizer.batch_decode(
    predicted_tokens,
    skip_special_tokens=True,
)
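Putting the pieces together, a hypothetical end-to-end generation call (the task prefix and input sentence are illustrative only, and the output depends entirely on the fine-tuned weights):

inputs = tokenizer(['translate English to French: How are you?'],
                   return_tensors='pt').to(device)
predicted_tokens = model.generate(
    inputs.input_ids,
    num_beams=5,
    early_stopping=True,
    max_length=32,
)
print(tokenizer.batch_decode(predicted_tokens, skip_special_tokens=True))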
