simple text classification model using pytorch in python

To build a simple text classification model using PyTorch in Python, follow these steps:

  1. Import the necessary libraries:
main.py
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data  # requires an older torchtext (<= 0.11) that still ships the legacy API

# use a GPU if one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  2. Define the fields for text preprocessing:
main.py
# tokenize with spaCy; requires the en_core_web_sm model (python -m spacy download en_core_web_sm)
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True)
LABEL = data.LabelField(dtype=torch.float)
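To verify the field behaves as expected, you can run a sample string through its preprocessing pipeline (a quick sanity check, assuming the spaCy model above is installed):
main.py
print(TEXT.preprocess('This Movie was GREAT!'))
# ['this', 'movie', 'was', 'great', '!']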
  3. Load the dataset:
main.py
train_data, test_data = data.TabularDataset.splits(
    path='path/to/dataset', train='train.csv', test='test.csv', format='csv',
    fields=[('text', TEXT), ('label', LABEL)]  # add skip_header=True if the CSV files have a header row
)
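A quick way to confirm the files parsed correctly is to inspect the first example (an optional check, assuming the CSV paths above exist):
main.py
print(len(train_data), len(test_data))  # number of examples per split
print(vars(train_data.examples[0]))     # {'text': [...tokens...], 'label': ...}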
  4. Build the vocabulary and create data iterators:
main.py
# build the vocabulary from the training set; the GloVe vectors are downloaded automatically
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),  # batch examples of similar length together
    sort_within_batch=True,
    device=device  # place batches on the GPU when available
)
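Before moving on, it can help to inspect the vocabulary that was just built (an optional check; the pos/neg labels are only an example):
main.py
print(len(TEXT.vocab))                  # 10002: 10000 words plus <unk> and <pad>
print(TEXT.vocab.freqs.most_common(5))  # the five most frequent training tokens
print(LABEL.vocab.stoi)                 # label-to-index map, e.g. {'neg': 0, 'pos': 1}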
  5. Define the model architecture:
main.py
class TextClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # 2-layer bidirectional LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)
        # the final hidden state concatenates both directions, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text):
        # text: [seq_len, batch_size]
        embedded = self.embedding(text)  # [seq_len, batch_size, embedding_dim]
        output, (hidden, cell) = self.rnn(embedded)
        # concatenate the last forward and backward hidden states
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # [batch_size, hidden_dim * 2]
        hidden = self.dropout(hidden)
        return self.fc(hidden)  # [batch_size, output_dim]
  6. Initialize the model:
main.py
input_dim = len(TEXT.vocab)
embedding_dim = 100  # must match the 100-dimensional GloVe vectors
hidden_dim = 256
output_dim = 1
model = TextClassifier(input_dim, embedding_dim, hidden_dim, output_dim)

# copy the pretrained GloVe vectors built in step 4 into the embedding layer
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
model = model.to(device)
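A throwaway forward pass is a cheap way to confirm the shapes line up before training (a hypothetical check using random token indices rather than real data):
main.py
dummy = torch.randint(0, input_dim, (12, 4)).to(device)  # fake batch: 12 tokens, batch size 4
print(model(dummy).shape)  # expected: torch.Size([4, 1])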
  7. Define the loss function and optimizer:
main.py
# BCEWithLogitsLoss applies a sigmoid internally, so the model outputs raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())
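If BCEWithLogitsLoss is unfamiliar: it computes the same value as a sigmoid followed by nn.BCELoss, just more numerically stably, as this small sketch with made-up logits shows:
main.py
logits = torch.tensor([0.5, -1.2])
targets = torch.tensor([1.0, 0.0])
combined = nn.BCEWithLogitsLoss()(logits, targets)
manual = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.allclose(combined, manual))  # True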
  8. Train the model:
main.py
def train(model, iterator, optimizer, criterion):
    model.train()  # enable dropout
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)  # [batch_size]
        loss = criterion(predictions, batch.label)
        loss.backward()
        optimizer.step()

def evaluate(model, iterator, criterion):
    model.eval()  # disable dropout
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            total_loss += loss.item()
    return total_loss / len(iterator)  # average loss per batch

EPOCHS = 10
for epoch in range(EPOCHS):
    train(model, train_iterator, optimizer, criterion)
    loss = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1}, Test Loss: {loss:.4f}')
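A raw loss value is hard to interpret on its own, so an accuracy metric is often tracked next to evaluate; binary_accuracy below is a hypothetical helper sketching the usual approach:
main.py
def binary_accuracy(predictions, labels):
    # threshold the sigmoid output at 0.5 and compare with the true labels
    rounded = torch.round(torch.sigmoid(predictions))
    return (rounded == labels).float().mean().item()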
  9. Use the trained model for predictions:
main.py
import spacy
nlp = spacy.load('en_core_web_sm')  # the same tokenizer the TEXT field uses

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    # lowercase to match the field's preprocessing, then map tokens to indices
    indexed = [TEXT.vocab.stoi[t.lower()] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add a batch dimension: [seq_len, 1]
    prediction = torch.sigmoid(model(tensor))  # turn the logit into a probability
    return prediction.item()
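Calling it on two contrasting sentences illustrates the intended use; which class a score near 1.0 denotes depends on the mapping LABEL.build_vocab produced in step 4 (the sentences here are made up):
main.py
print(predict_sentiment(model, 'This film was absolutely wonderful!'))
print(predict_sentiment(model, 'A dull, boring waste of time.'))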

Now you have a simple text classification model using PyTorch in Python. You can modify the architecture, training parameters, and preprocessing steps to suit your specific needs.
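If you plan to reuse the trained model elsewhere, a common pattern is to persist only its weights (a minimal sketch; 'model.pt' is an arbitrary filename):
main.py
torch.save(model.state_dict(), 'model.pt')     # save the learned weights
model.load_state_dict(torch.load('model.pt'))  # restore them into a matching TextClassifier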
