ADD file via upload

2025-01-15 08:43:29 +08:00 · 2025-01-15 08:43:29 +08:00 · d08d381cf6
parent 344ad55fc0
commit d08d381cf6
1 changed files with 114 additions and 0 deletions
--- a/test1.py
+++ b/test1.py
@ -0,0 +1,114 @@
+import os
+import tempfile
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from ray import train
+from ray.train import Checkpoint
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+
+
+class ConvNet(nn.Module):
+    """Minimal CNN classifier: one 3x3 convolution plus one linear layer.
+
+    The linear layer expects 192 features per sample, which is what the
+    conv/pool stack yields for a 1x28x28 input (3 channels * 8 * 8).
+    """
+
+    def __init__(self):
+        super().__init__()
+        # The architecture is intentionally fixed to keep the example simple.
+        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
+        self.fc = nn.Linear(192, 10)
+
+    def forward(self, x):
+        """Return per-class log-probabilities of shape (batch, 10).
+
+        Args:
+            x: Image batch of shape (batch, 1, H, W); H = W = 28 matches the
+                192-feature linear layer.
+
+        Raises:
+            RuntimeError: If the pooled feature count per sample is not 192.
+        """
+        x = F.relu(F.max_pool2d(self.conv1(x), 3))
+        # Flatten everything except the batch dimension. Unlike
+        # ``view(-1, 192)``, this never folds surplus features into extra
+        # batch rows, so a wrongly sized input fails loudly in ``self.fc``.
+        x = torch.flatten(x, 1)
+        x = self.fc(x)
+        return F.log_softmax(x, dim=1)
+
+
+# Number of train / evaluate / report rounds executed by main().
+EPOCH = 10
+# Sample-count thresholds that train_func / test_func compare against
+# ``batch_idx * len(batch)`` to cut each pass short.
+TRAIN_SIZE = 512
+TEST_SIZE = 256
+
+
+def train_func(model, optimizer, train_loader, max_samples=None):
+    """Run one truncated training pass over ``train_loader``.
+
+    Args:
+        model: Module to train; it is switched to training mode.
+        optimizer: Optimizer bound to ``model``'s parameters.
+        train_loader: Iterable yielding ``(data, target)`` batches.
+        max_samples: Sample-count threshold for ending the pass early.
+            Defaults to the module-level ``TRAIN_SIZE``.
+
+    The pass stops at the first batch whose index times its length exceeds
+    ``max_samples``, so slightly more than ``max_samples`` samples can be
+    consumed (e.g. 9 batches of 64 for a threshold of 512).
+    """
+    if max_samples is None:
+        max_samples = TRAIN_SIZE
+
+    # Follow the model's own device instead of guessing from CUDA
+    # availability; a guess breaks when a CPU model runs on a CUDA host.
+    first_param = next(model.parameters(), None)
+    if first_param is not None:
+        device = first_param.device
+    else:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        # Truncate the epoch so the example runs quickly.
+        if batch_idx * len(data) > max_samples:
+            return
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+
+
+def test_func(model, data_loader, max_samples=None):
+    """Return the classification accuracy of ``model`` on a truncated pass.
+
+    Args:
+        model: Module to evaluate; it is switched to eval mode.
+        data_loader: Iterable yielding ``(data, target)`` batches.
+        max_samples: Sample-count threshold for ending the pass early.
+            Defaults to the module-level ``TEST_SIZE``.
+
+    Returns:
+        Fraction of correctly classified samples among those evaluated, or
+        ``0.0`` when no sample was evaluated (e.g. an empty loader).
+    """
+    if max_samples is None:
+        max_samples = TEST_SIZE
+
+    # Follow the model's own device instead of guessing from CUDA
+    # availability; a guess breaks when a CPU model runs on a CUDA host.
+    first_param = next(model.parameters(), None)
+    if first_param is not None:
+        device = first_param.device
+    else:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model.eval()
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for batch_idx, (data, target) in enumerate(data_loader):
+            # Truncate the evaluation so the example runs quickly.
+            if batch_idx * len(data) > max_samples:
+                break
+            data, target = data.to(device), target.to(device)
+            predicted = model(data).argmax(dim=1)
+            total += target.size(0)
+            correct += (predicted == target).sum().item()
+
+    # Avoid ZeroDivisionError when the loader produced no batches.
+    if total == 0:
+        return 0.0
+    return correct / total
+
+
+# The name matches pytest's ``test_*`` pattern; mark the helper as not-a-test
+# so test runners do not try to collect it.
+test_func.__test__ = False
+
+
+# The entry point must be named ``main`` and take exactly one dict
+# argument, ``config``.
+def main(config):
+    """Train ConvNet on MNIST and report accuracy to Tune after every epoch.
+
+    Args:
+        config: Hyperparameter dict; the keys ``"lr"`` and ``"momentum"``
+            are read to configure the SGD optimizer.
+    """
+    # Data setup.
+    preprocessing = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,)),
+    ])
+
+    # To load custom data instead, read the location from
+    # config["dataset_path"].
+    train_set = datasets.MNIST(
+        "/opt/ray/MNIST", train=True, download=True, transform=preprocessing
+    )
+    test_set = datasets.MNIST(
+        "/opt/ray/MNIST", train=False, transform=preprocessing
+    )
+    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
+    test_loader = DataLoader(test_set, batch_size=64, shuffle=True)
+
+    device_name = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device_name)
+
+    model = ConvNet()
+    model.to(device)
+
+    optimizer = optim.SGD(
+        model.parameters(),
+        lr=config["lr"],
+        momentum=config["momentum"],
+    )
+
+    for epoch_number in range(1, EPOCH + 1):
+        train_func(model, optimizer, train_loader)
+        accuracy = test_func(model, test_loader)
+
+        # Checkpoints must be written following the pattern below.
+        with tempfile.TemporaryDirectory() as scratch_dir:
+            if epoch_number % 5 == 0:
+                # Every fifth epoch, save the weights into the temporary
+                # directory and wrap that directory as a checkpoint.
+                weights_path = os.path.join(scratch_dir, "model.pth")
+                torch.save(model.state_dict(), weights_path)
+                checkpoint = Checkpoint.from_directory(scratch_dir)
+            else:
+                checkpoint = None
+
+            # Metrics must be reported to Tune every epoch, and the metric
+            # name must match the one given in the launch command.
+            train.report({"mean_accuracy": accuracy}, checkpoint=checkpoint)