simple text classification model using pytorch in python

To build a simple text classification model using PyTorch in Python, follow these steps:

  1. Import the necessary libraries:
main.py
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data  # requires an older torchtext (<= 0.11) that still ships the legacy API

# use a GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  2. Define the fields for text preprocessing:
main.py
# tokenize with spaCy; requires the en_core_web_sm model (python -m spacy download en_core_web_sm)
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True)
LABEL = data.LabelField(dtype=torch.float)
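To verify the field behaves as expected, you can run a sample string through its preprocessing pipeline (a quick sanity check, assuming the spaCy model above is installed):
main.py
print(TEXT.preprocess('This Movie was GREAT!'))
# ['this', 'movie', 'was', 'great', '!']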
  3. Load the dataset:
main.py
train_data, test_data = data.TabularDataset.splits(
    path='path/to/dataset', train='train.csv', test='test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)]  # add skip_header=True if the CSV files have a header row
)
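A quick way to confirm the files parsed correctly is to inspect the first example (an optional check, assuming the CSV paths above exist):
main.py
print(len(train_data), len(test_data))  # number of examples per split
print(vars(train_data.examples[0]))     # {'text': [...tokens...], 'label': ...}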
  4. Build the vocabulary and create data iterators:
main.py
# build the vocabulary from the training set; the GloVe vectors are downloaded automatically
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),  # batch examples of similar length together
    sort_within_batch=True,
    device=device  # place batches on the GPU when available
)
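Before moving on, it can help to inspect the vocabulary that was just built (an optional check; the pos/neg labels are only an example):
main.py
print(len(TEXT.vocab))                  # 10002: 10000 words plus <unk> and <pad>
print(TEXT.vocab.freqs.most_common(5))  # the five most frequent training tokens
print(LABEL.vocab.stoi)                 # label-to-index map, e.g. {'neg': 0, 'pos': 1}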
  5. Define the model architecture:
main.py
class TextClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # 2-layer bidirectional LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)
        # the final hidden state concatenates both directions, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text):
        # text: [seq_len, batch_size]
        embedded = self.embedding(text)  # [seq_len, batch_size, embedding_dim]
        output, (hidden, cell) = self.rnn(embedded)
        # concatenate the last forward and backward hidden states
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # [batch_size, hidden_dim * 2]
        hidden = self.dropout(hidden)
        return self.fc(hidden)  # [batch_size, output_dim]
  6. Initialize the model:
main.py
input_dim = len(TEXT.vocab)
embedding_dim = 100  # must match the 100-dimensional GloVe vectors
hidden_dim = 256
output_dim = 1
model = TextClassifier(input_dim, embedding_dim, hidden_dim, output_dim)

# copy the pretrained GloVe vectors built in step 4 into the embedding layer
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model = model.to(device)
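A throwaway forward pass is a cheap way to confirm the shapes line up before training (a hypothetical check using random token indices rather than real data):
main.py
dummy = torch.randint(0, input_dim, (12, 4)).to(device)  # fake batch: 12 tokens, batch size 4
print(model(dummy).shape)  # expected: torch.Size([4, 1])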
  7. Define the loss function and optimizer:
main.py
# BCEWithLogitsLoss applies a sigmoid internally, so the model outputs raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())
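If BCEWithLogitsLoss is unfamiliar: it computes the same value as a sigmoid followed by nn.BCELoss, just more numerically stably, as this small sketch with made-up logits shows:
main.py
logits = torch.tensor([0.5, -1.2])
targets = torch.tensor([1.0, 0.0])
combined = nn.BCEWithLogitsLoss()(logits, targets)
manual = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.allclose(combined, manual))  # True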
  8. Train the model:
main.py
def train(model, iterator, optimizer, criterion):
    model.train()  # enable dropout
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)  # [batch_size]
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()

def evaluate(model, iterator, criterion):
    model.eval()  # disable dropout
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            total_loss += loss.item()
    return total_loss / len(iterator)  # average loss per batch

EPOCHS = 10
for epoch in range(EPOCHS):
    train(model, train_iterator, optimizer, criterion)
    loss = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1}, Test Loss: {loss:.4f}')
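A raw loss value is hard to interpret on its own, so an accuracy metric is often tracked next to evaluate; binary_accuracy below is a hypothetical helper sketching the usual approach:
main.py
def binary_accuracy(predictions, labels):
    # threshold the sigmoid output at 0.5 and compare with the true labels
    rounded = torch.round(torch.sigmoid(predictions))
    return (rounded == labels).float().mean().item()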
  9. Use the trained model for predictions:
main.py
import spacy
nlp = spacy.load('en_core_web_sm')  # the same tokenizer the TEXT field uses

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    # lowercase to match the field's preprocessing, then map tokens to indices
    indexed = [TEXT.vocab.stoi[t.lower()] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add a batch dimension: [seq_len, 1]
    prediction = torch.sigmoid(model(tensor))  # turn the logit into a probability
    return prediction.item()
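Calling it on two contrasting sentences illustrates the intended use; which class a score near 1.0 denotes depends on the mapping LABEL.build_vocab produced in step 4 (the sentences here are made up):
main.py
print(predict_sentiment(model, 'This film was absolutely wonderful!'))
print(predict_sentiment(model, 'A dull, boring waste of time.'))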

Now you have a simple text classification model using PyTorch in Python. You can modify the architecture, training parameters, and preprocessing steps to suit your specific needs.
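If you plan to reuse the trained model elsewhere, a common pattern is to persist only its weights (a minimal sketch; 'model.pt' is an arbitrary filename):
main.py
torch.save(model.state_dict(), 'model.pt')     # save the learned weights
model.load_state_dict(torch.load('model.pt'))  # restore them into a matching TextClassifier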
