Source: Xueqiu App. Author: DrChuck (https://xueqiu.com/1039527614/311609379)
A Hands-On Introduction to GPT
I'm honored to have been nominated for Xueqiu's 2024 Top Ten Most Influential Users. Voting link here; thank you all for supporting DrChuck: [web link]
At the end of 2022, ChatGPT burst onto the scene and changed the world. Unfortunately, OpenAI is no longer open, and its models have been closed-source ever since. Only the GPT-2 weights remain openly available, right on Hugging Face. Although GPT-2 was built for natural language processing, predicting the next token, with a few small changes it can be repurposed to predict stocks. Here I modify it to predict, from the previous ten days of prices, whether the next day will make a new high, as a reference; I look forward to seeing more creative mods.
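To make the labeling rule concrete, here is a toy illustration (the prices are made up): the label is 1 only when the day after a ten-day window closes above that window's highest close.

closes = [10, 11, 12, 11, 13, 12, 14, 13, 15, 14, 16]  # ten-day window plus the next day
window, next_day = closes[:10], closes[10]
label = 1 if next_day > max(window) else 0  # 16 > 15, so label = 1 (new high)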
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from transformers import GPT2Model, GPT2Config
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# Step 1: Load and Combine All Stock Data
def load_all_stocks(data_folder):
    dataframes = []
    for file_name in os.listdir(data_folder):
        if file_name.endswith(".csv"):
            stock_id = file_name.split('.')[0]  # Use the filename as stock_id
            df = pd.read_csv(os.path.join(data_folder, file_name))
            df["stock_id"] = stock_id
            dataframes.append(df)
    combined_df = pd.concat(dataframes, ignore_index=True)  # Reindex to avoid duplicate row labels
    return combined_df
data_folder = "ss" # Replace with your folder path
df = load_all_stocks(data_folder)
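# The loader assumes each CSV in the folder has Open/High/Low/Close/Volume columns,
# one file per stock. If you have no data yet, one way to build such a folder (an
# assumption of this sketch, not something the original post specifies) is yfinance:
# import yfinance as yf
# os.makedirs("ss", exist_ok=True)
# for ticker in ["AAPL", "MSFT"]:  # hypothetical tickers
#     yf.download(ticker, start="2020-01-01").to_csv(os.path.join("ss", f"{ticker}.csv"))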
# Step 2: Normalize Data
scaler = MinMaxScaler()
df[['Open', 'High', 'Low', 'Close', 'Volume']] = scaler.fit_transform(df[['Open', 'High', 'Low', 'Close', 'Volume']])
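# Note that the scaler is fit once over all stocks combined, so a low-priced stock is
# compressed by a high-priced one's range. A per-stock alternative (a sketch, not the
# original author's choice) would be:
# cols = ['Open', 'High', 'Low', 'Close', 'Volume']
# df[cols] = df.groupby('stock_id')[cols].transform(
#     lambda s: (s - s.min()) / (s.max() - s.min() + 1e-9))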
# Step 3: Define Dataset Class for Multiple Stocks
class MultiStockDataset(Dataset):
    def __init__(self, df, seq_length=10):
        self.seq_length = seq_length
        # Sort so each stock's rows are contiguous and in the same (sorted) order
        # that groupby() iterates in; this keeps targets aligned with rows
        data = df.sort_values('stock_id', kind='stable').reset_index(drop=True)
        self.data = data[['stock_id', 'Open', 'High', 'Low', 'Close', 'Volume']].copy()
        self.data['stock_id'] = pd.Categorical(self.data['stock_id']).codes  # Encode stock IDs as integers
        self.targets = self.create_targets(data)
        # Valid window start offsets: a window plus its target day must stay within one stock
        self.starts = []
        offset = 0
        for _, group in data.groupby('stock_id'):
            self.starts.extend(range(offset, offset + len(group) - seq_length))
            offset += len(group)

    def create_targets(self, df):
        targets = []
        for stock_id, group in df.groupby('stock_id'):
            close_prices = group['Close'].values
            stock_targets = [
                1 if close_prices[i + self.seq_length] > max(close_prices[i:i + self.seq_length]) else 0
                for i in range(len(close_prices) - self.seq_length)
            ]
            targets.extend(stock_targets)
        return targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        start = self.starts[idx]  # Map the sample index to a row offset that never crosses stocks
        x = torch.tensor(self.data.iloc[start:start + self.seq_length, 1:].values, dtype=torch.float32)  # Exclude stock_id
        y = torch.tensor(self.targets[idx], dtype=torch.long)
        return x, y
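# Quick sanity check: each sample should be a (seq_length, 5) float window with a 0/1 label
_ds = MultiStockDataset(df, seq_length=10)
_x, _y = _ds[0]
print(_x.shape, _y.item())  # expected: torch.Size([10, 5]) and 0 or 1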
# Step 4: Load GPT-2 Model and Modify for Stock Prediction
config = GPT2Config.from_pretrained("openai-community/gpt2")
gpt2_model = GPT2Model.from_pretrained("openai-community/gpt2", config=config)
class StockGPT2(nn.Module):
    def __init__(self, gpt2_model, input_dim=5, gpt_dim=768):
        super(StockGPT2, self).__init__()
        self.gpt2 = gpt2_model
        self.projection = nn.Linear(input_dim, gpt_dim)  # Project stock features to GPT-2 embedding size
        self.out_head = nn.Linear(gpt_dim, 2)  # Binary classification head

    def forward(self, x):
        x = self.projection(x)  # (batch, seq, 5) -> (batch, seq, 768)
        outputs = self.gpt2(inputs_embeds=x)  # Feed projected features in place of token embeddings
        last_hidden_state = outputs.last_hidden_state[:, -1, :]  # Hidden state at the final day
        return self.out_head(last_hidden_state)
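# Sanity check the forward pass with a dummy batch (2 windows, 10 days, 5 features);
# the output should be one pair of class logits per window
_logits = StockGPT2(gpt2_model)(torch.randn(2, 10, 5))
print(_logits.shape)  # expected: torch.Size([2, 2])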
# Step 5: Prepare DataLoader
seq_length = 10
batch_size = 8
dataset = MultiStockDataset(df, seq_length=seq_length)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
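# Caveat: shuffle=True mixes windows from all dates, and the accuracy printed during
# training is in-sample only. For an honest evaluation you would hold out the later
# windows, roughly like this (a rough sketch, not part of the original post):
# from torch.utils.data import Subset
# split = int(len(dataset) * 0.8)
# train_loader = DataLoader(Subset(dataset, range(split)), batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(Subset(dataset, range(split, len(dataset))), batch_size=batch_size)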
# Step 6: Set up Model and Optimizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = StockGPT2(gpt2_model).to(device)
# Freeze all parameters, then unfreeze the pieces to fine-tune: the last transformer
# block, the final layer norm, the output head, and the newly initialized projection
for param in model.parameters():
    param.requires_grad = False
for param in model.gpt2.h[-1].parameters():
    param.requires_grad = True
for param in model.gpt2.ln_f.parameters():
    param.requires_grad = True
for param in model.out_head.parameters():
    param.requires_grad = True
for param in model.projection.parameters():
    param.requires_grad = True  # Starts from random weights, so it must be trained too
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)
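# Optional check: confirm how little of the network actually trains after freezing
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,}")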
# Step 7: Define Training Function
def train_model(model, train_loader, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss, correct_preds, total_preds = 0, 0, 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = F.cross_entropy(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1)
            correct_preds += (preds == y).sum().item()
            total_preds += y.size(0)
        accuracy = correct_preds / total_preds
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}, Accuracy: {accuracy * 100:.2f}%")
# Step 8: Train the Model
train_model(model, train_loader, num_epochs=10)
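# Step 9: Inference (an added sketch, not part of the original recipe): score the
# dataset's most recent window and read off the predicted probability of a new high
model.eval()
with torch.no_grad():
    x_last, _ = dataset[len(dataset) - 1]
    probs = torch.softmax(model(x_last.unsqueeze(0).to(device)), dim=1)
print(f"P(new high next day) = {probs[0, 1].item():.2%}")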