From 5451ff84b1e8fc4f0ce3c4a3b60fa48086e35dfb Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Mon, 15 May 2023 06:53:08 +0000
Subject: [PATCH 01/18] add mocov2

---
 passl/data/preprocess/basic_transforms.py     |  19 +
 passl/engine/loops/loop.py                    |   4 +
 passl/models/__init__.py                      |   2 +-
 passl/models/mocov2.py                        | 325 ++++++++++++++++++
 passl/scheduler/__init__.py                   |   2 +-
 passl/scheduler/lr_scheduler.py               |  56 +++
 tasks/ssl/mocov2/builder_moco.py              | 159 +++++++++
 .../configs/mocov2_resnet50_lp_in1k_1n8c.yaml | 114 ++++++
 .../configs/mocov2_resnet50_pt_in1k_1n8c.yaml |  97 ++++++
 tasks/ssl/mocov2/dataset                      |   1 +
 tasks/ssl/mocov2/extract_weight.py            |  56 +++
 tasks/ssl/mocov2/linearprobe.sh               |  26 ++
 tasks/ssl/mocov2/pretrain.sh                  |  26 ++
 13 files changed, 885 insertions(+), 2 deletions(-)
 create mode 100644 passl/models/mocov2.py
 create mode 100644 tasks/ssl/mocov2/builder_moco.py
 create mode 100644 tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
 create mode 100644 tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
 create mode 120000 tasks/ssl/mocov2/dataset
 create mode 100644 tasks/ssl/mocov2/extract_weight.py
 create mode 100644 tasks/ssl/mocov2/linearprobe.sh
 create mode 100644 tasks/ssl/mocov2/pretrain.sh

diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py
index 7be2b26a..374b05b3 100644
--- a/passl/data/preprocess/basic_transforms.py
+++ b/passl/data/preprocess/basic_transforms.py
@@ -57,6 +57,7 @@
     "SimCLRGaussianBlur",
     "BYOLSolarize",
     "MAERandCropImage",
+    "GaussianBlur",
 ]
 
 
@@ -941,3 +942,21 @@ def __call__(self, img):
             else:
                 img = ImageOps.solarize(img)
         return img
+
+class GaussianBlur(object):
+    """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709"""
+
+    def __init__(self, sigma=(.1, 2.), p=1.0):
+        self.p = p  # probability of blurring on each call
+        self.sigma = sigma  # (min, max) range the blur radius is drawn from
+
+    def __call__(self, img):
+        """Blur ``img`` with probability ``p``; a blurred image is returned as an ndarray."""
+        if random.random() < self.p:
+            if not isinstance(img, Image.Image):
+                img = np.ascontiguousarray(img)
+                img = Image.fromarray(img)
+            sigma = random.uniform(self.sigma[0], self.sigma[1])
+            # ``filter`` always yields a PIL image, so the array conversion is unconditional.
+            img = np.asarray(img.filter(ImageFilter.GaussianBlur(radius=sigma)))
+        return img
\ No newline at end of file
diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py
index 35bdfa1d..8d398feb 100644
--- a/passl/engine/loops/loop.py
+++ b/passl/engine/loops/loop.py
@@ -285,6 +285,10 @@ def train_one_epoch(self):
                     paddle.to_tensor(batch[0]['label'])
                 ]
 
+            for idx, value in enumerate(batch):
+                if isinstance(value,paddle.Tensor):
+                    batch[idx] = batch[idx].cuda()
+
             self.global_step += 1
 
             # do forward and backward
diff --git a/passl/models/__init__.py b/passl/models/__init__.py
index de3b9a8e..c78f3e65 100644
--- a/passl/models/__init__.py
+++ b/passl/models/__init__.py
@@ -28,7 +28,7 @@
 from .convnext import *
 from .mocov3 import *
 from .simsiam import *
-
+from .mocov2 import *
 __all__ = ["build_model"]
 
 
diff --git a/passl/models/mocov2.py b/passl/models/mocov2.py
new file mode 100644
index 00000000..20ef3adf
--- /dev/null
+++ b/passl/models/mocov2.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections.abc import Callable
+
+import os
+import copy
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+from passl.nn import init
+import paddle.nn.functional as F
+from passl.models.base_model import Model
+from paddle.nn.initializer import Constant, Normal
+from functools import partial, reduce
+from passl.models.resnet import ResNet
+from paddle.vision.models.resnet import resnet50
+import random
+__all__ = [
+    'mocov2_resnet50_linearprobe',
+    'mocov2_resnet50_pretrain',
+]
+
+class MoCoV2Projector(nn.Layer):  # optional global average pooling, then Linear + ReLU
+    def __init__(self, with_pool, in_dim, out_dim):
+        super().__init__()
+        # with_pool: when true, the input is pooled to 1x1 and flattened before the MLP
+        self.with_pool = with_pool
+        if with_pool:
+            self.avgpool = nn.Sequential(
+                nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1))
+
+        self.mlp = nn.Sequential(nn.Linear(in_dim, out_dim), nn.ReLU())
+
+    def forward(self, x):
+        """Apply the optional pooling stage, then the Linear + ReLU layer."""
+        if self.with_pool:
+            x = self.avgpool(x)
+
+        x = self.mlp(x)
+        return x
+
+
+class MoCoClassifier(nn.Layer):  # optional global average pooling followed by one Linear layer
+    def __init__(self, with_pool, num_features, class_num):
+        super().__init__()
+        # with_pool: when true, the input is pooled to 1x1 and flattened before the fc layer
+        self.with_pool = with_pool
+        if with_pool:
+            self.avgpool = nn.Sequential(
+                nn.AdaptiveAvgPool2D((1, 1)), nn.Flatten(start_axis=1))
+
+        self.fc = nn.Linear(num_features, class_num)
+        normal_ = Normal(std=0.01)
+        zeros_ = Constant(value=0.)
+        # fc weight drawn from Normal(std=0.01), fc bias set to zero
+        normal_(self.fc.weight)
+        zeros_(self.fc.bias)
+
+    def save(self,path):  # writes only the fc parameters to "<path>.pdparams"
+        paddle.save(self.fc.state_dict(),path + ".pdparams")
+    def load(self,path):  # restores only the fc parameters from "<path>.pdparams"
+        self.fc.set_state_dict(paddle.load(path+".pdparams"))
+
+
+    def forward(self, x):
+        """Apply the optional pooling stage, then the fc layer."""
+        if self.with_pool:
+            x = self.avgpool(x)
+        x = self.fc(x)
+        return x
+
+
+class MoCoV2Pretain(Model):
+    """ MoCo v1, v2 pre-training model: query encoder, momentum key encoder and a key queue.
+
+    ref: https://github.com/facebookresearch/moco/blob/main/moco/builder.py
+    ref: https://github.com/PaddlePaddle/PASSL/blob/main/passl/modeling/architectures/moco.py
+    """
+
+    def __init__(self,
+                 base_encoder,
+                 base_projector,
+                 base_classifier,
+                 momentum_encoder,
+                 momentum_projector,
+                 momentum_classifier,
+                 dim=128,  # number of rows of the key queue
+                 K=65536,  # number of keys (columns) held in the queue
+                 m=0.999,  # momentum coefficient of the key-encoder update
+                 T=0.07,  # temperature the logits are divided by
+                 **kwargs):
+        super(MoCoV2Pretain, self).__init__()
+
+        self.m = m
+        self.T = T
+        self.K = K
+        # the six *_encoder / *_projector / *_classifier arguments are factories called without arguments
+        self.base_encoder = nn.Sequential(base_encoder(), base_projector(),
+                                          base_classifier())
+        self.momentum_encoder = nn.Sequential(
+            momentum_encoder(), momentum_projector(), momentum_classifier())
+
+        for param_b, param_m in zip(self.base_encoder.parameters(),
+                                    self.momentum_encoder.parameters()):
+            param_m.copy_(param_b, False)  # initialize
+            param_m.stop_gradient = True  # not update by gradient
+
+        # create the queue
+        self.register_buffer("queue", paddle.randn([dim, K]))
+        self.queue = F.normalize(self.queue, axis=0)
+
+        self.register_buffer("queue_ptr", paddle.zeros([1], 'int64'))
+
+        self.loss_fuc = nn.CrossEntropyLoss()
+
+    def save(self, path, local_rank=0, rank=0):  # local_rank and rank are not used in this method
+        paddle.save(self.state_dict(), path + ".pdparams")
+        # second file: base_encoder weights with the "base_encoder." prefix stripped, all other keys dropped
+        # rename moco pre-trained keys
+        state_dict = self.state_dict()
+        for k in list(state_dict.keys()):
+            # retain only base_encoder up to before the embedding layer
+            if k.startswith('base_encoder') and not k.startswith(
+                    'base_encoder.head'):
+                # remove prefix
+                state_dict[k[len("base_encoder."):]] = state_dict[k]
+            # delete renamed or unused k
+            del state_dict[k]
+        # NOTE(review): no 'head' sub-layer is created in __init__, so the 'base_encoder.head' test presumably never matches - confirm
+        paddle.save(state_dict, path + "_base_encoder.pdparams")
+
+    @paddle.no_grad()
+    def _update_momentum_encoder(self):
+        """Momentum update of the momentum encoder"""
+        # Note(GuoxiaWang): disable auto cast when using mixed precision
+        with paddle.amp.auto_cast(False):
+            for param_b, param_m in zip(self.base_encoder.parameters(),
+                                        self.momentum_encoder.parameters()):
+                paddle.assign((param_m * self.m + param_b * (1. - self.m)),
+                              param_m)
+                param_m.stop_gradient = True
+
+    # utils
+    @paddle.no_grad()
+    def concat_all_gather(self, tensor):
+        """
+        Gather ``tensor`` from all ranks and concatenate along axis 0; returned unchanged when world size < 2.
+        """
+        if paddle.distributed.get_world_size() < 2:
+            return tensor
+        tensors_gather = []
+        paddle.distributed.all_gather(tensors_gather, tensor)
+
+        output = paddle.concat(tensors_gather, axis=0)
+        return output
+
+    @paddle.no_grad()
+    def _dequeue_and_enqueue(self, keys):
+        keys = self.concat_all_gather(keys)
+        # the gathered keys overwrite queue columns [ptr, ptr + batch_size), then the pointer advances modulo K
+        batch_size = keys.shape[0]
+
+        ptr = int(self.queue_ptr[0])
+        assert self.K % batch_size == 0  # for simplicity
+
+        # replace the keys at ptr (dequeue and enqueue)
+        self.queue[:, ptr:ptr + batch_size] = keys.transpose([1, 0])
+        ptr = (ptr + batch_size) % self.K  # move pointer
+
+        self.queue_ptr[0] = ptr
+
+    @paddle.no_grad()
+    def _batch_shuffle_ddp(self, x):
+        """
+        Batch shuffle, for making use of BatchNorm.
+        *** Only support DistributedDataParallel (DDP) model. ***
+        """
+        # gather from all gpus
+        batch_size_this = x.shape[0]
+        x_gather = self.concat_all_gather(x)
+        batch_size_all = x_gather.shape[0]
+
+        num_gpus = batch_size_all // batch_size_this
+
+        # random shuffle index
+        idx_shuffle = paddle.randperm(batch_size_all)
+
+        # broadcast to all gpus
+        if paddle.distributed.get_world_size() > 1:
+            paddle.distributed.broadcast(idx_shuffle, src=0)
+
+        # index for restoring
+        idx_unshuffle = paddle.argsort(idx_shuffle)
+
+        # shuffled index for this gpu
+        gpu_idx = paddle.distributed.get_rank()
+        idx_this = idx_shuffle.reshape([num_gpus, -1])[gpu_idx]
+        return paddle.gather(x_gather, idx_this, axis=0), idx_unshuffle  # (this rank's shuffled slice, indices that undo the shuffle)
+
+    @paddle.no_grad()
+    def _batch_unshuffle_ddp(self, x, idx_unshuffle):
+        """
+        Undo batch shuffle.
+        *** Only support DistributedDataParallel (DDP) model. ***
+        """
+        # gather from all gpus
+        batch_size_this = x.shape[0]
+        x_gather = self.concat_all_gather(x)
+        batch_size_all = x_gather.shape[0]
+
+        num_gpus = batch_size_all // batch_size_this
+
+        # restored index for this gpu
+        gpu_idx = paddle.distributed.get_rank()
+        idx_this = idx_unshuffle.reshape([num_gpus, -1])[gpu_idx]
+
+        return paddle.gather(x_gather, idx_this, axis=0)
+
+    def forward(self, inputs):
+        assert isinstance(inputs, list)  # inputs[0] is encoded as queries, inputs[1] as keys
+        x1 = inputs[0]
+        x2 = inputs[1]
+        # compute query features
+        q = self.base_encoder(x1)  # queries: NxC
+        q = F.normalize(q, axis=1)
+
+        # compute key features
+        with paddle.no_grad():  # no gradient
+            self._update_momentum_encoder()  # update the momentum encoder
+
+            # shuffle for making use of BN
+            k, idx_unshuffle = self._batch_shuffle_ddp(x2)
+
+            k = self.momentum_encoder(k)  # keys: NxC
+            k = F.normalize(k, axis=1)
+
+            # undo shuffle
+            k = self._batch_unshuffle_ddp(k, idx_unshuffle)
+
+        # compute logits
+        # (plain sum / matmul are used here instead of einsum)
+        # positive logits: Nx1
+        l_pos = paddle.sum(q * k, axis=1).unsqueeze(-1)
+        # negative logits: NxK
+        l_neg = paddle.matmul(q, self.queue.clone().detach())
+
+        # logits: Nx(1+K)
+        logits = paddle.concat((l_pos, l_neg), axis=1)
+
+        # apply temperature
+        logits /= self.T
+
+        # labels: all zeros, because the positive logit is column 0 of ``logits``
+        labels = paddle.zeros([logits.shape[0]], dtype=paddle.int64)
+
+        # dequeue and enqueue
+        self._dequeue_and_enqueue(k)
+
+        return self.loss_fuc(logits, labels)
+
+class MoCoV2LinearProbe(ResNet):
+    """ MoCo v1, v2 linear probe: a ResNet in which only the final ``fc`` layer stays trainable.
+
+    ref: https://github.com/facebookresearch/moco/blob/main/moco/builder.py
+    ref: https://github.com/PaddlePaddle/PASSL/blob/main/passl/modeling/architectures/moco.py
+    """
+
+    def __init__(self,
+                 **kwargs):
+        super().__init__()  # NOTE(review): kwargs are accepted but not forwarded to ResNet - confirm this is intended
+        # freeze all layers but the last fc
+        for name, param in self.named_parameters():
+            if name not in ['fc.weight', 'fc.bias']:
+                param.stop_gradient = True
+
+        # sanity check: exactly the fc weight and bias remain trainable
+        parameters = list(
+            filter(lambda p: not p.stop_gradient, self.parameters()))
+        assert len(parameters) == 2  # weight, bias
+
+        init.normal_(self.fc.weight, mean=0.0, std=0.01)
+        init.zeros_(self.fc.bias)
+        self.apply(self._freeze_norm)
+
+    def _freeze_norm(self, layer):
+        if isinstance(layer, (nn.layer.norm._BatchNormBase)):
+            layer._use_global_stats = True  # presumably makes BN use its stored running statistics - TODO confirm
+
+
+def mocov2_resnet50_linearprobe(*, fc_weights="/wangguo/PASSL/pretrained/moco/class_fc.pdparams", **kwargs):
+    # **kwargs specify numclass; fc_weights is the fc-head checkpoint to restore (None keeps the fresh init)
+    resnet = MoCoV2LinearProbe(with_pool=True,**kwargs)
+    if fc_weights is not None: resnet.fc.load_dict(paddle.load(fc_weights))
+    return resnet
+def mocov2_resnet50_pretrain(**kwargs):
+    # each component is wrapped in partial(); MoCoV2Pretain calls these factories itself
+    base_encoder = partial(resnet50, with_pool=False,num_classes=0)
+    base_projector = partial(MoCoV2Projector, with_pool=True, in_dim=2048,out_dim=2048)
+    base_classifier = partial(MoCoClassifier, with_pool=False, num_features=2048, class_num=128)
+    momentum_encoder = partial(resnet50, with_pool=False, num_classes=0)
+    momentum_projector = partial(MoCoV2Projector,with_pool=True,in_dim=2048,out_dim=2048)
+    momentum_classifier = partial(MoCoClassifier,with_pool=False,num_features=2048,class_num=128)
+    model = MoCoV2Pretain(
+        base_encoder=base_encoder,
+        base_projector=base_projector,
+        base_classifier=base_classifier,
+        momentum_encoder=momentum_encoder,
+        momentum_projector=momentum_projector,
+        momentum_classifier=momentum_classifier,
+        T=0.2,  # fixed here; also passing T through **kwargs would raise a duplicate-keyword TypeError
+        **kwargs)
+    return model
diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py
index 6bec1e45..8a194fd9 100644
--- a/passl/scheduler/__init__.py
+++ b/passl/scheduler/__init__.py
@@ -15,7 +15,7 @@
 
 from passl.utils import logger
 
-from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly
+from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly, MultiStepDecay, CosineDecay
 from .lr_callable import LRCallable
 
 
diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py
index 223ca349..eb9467d4 100644
--- a/passl/scheduler/lr_scheduler.py
+++ b/passl/scheduler/lr_scheduler.py
@@ -200,3 +200,59 @@ def get_lr(self):
 
         return self.base_lr * pow(1 - float(self.last_epoch - self.warmups) /
                                   float(self.T_max - self.warmups), 2)
+
+class MultiStepDecay(lr.LRScheduler):
+    """Multiply the base LR by ``gamma`` once for every milestone already reached.
+
+    ``milestones`` are given in epochs; with ``decay_unit='step'`` they are
+    scaled by ``step_each_epoch`` so they compare against a per-step counter.
+    """
+    def __init__(self, learning_rate, step_each_epoch, epochs, milestones,
+                 gamma=0.1, last_epoch=-1, verbose=False, decay_unit='epoch',
+                 **kwargs):
+        assert decay_unit in ['step', 'epoch']
+        if decay_unit == 'step':
+            milestones = [mile * step_each_epoch for mile in milestones]
+        # Assign after the unit conversion so that 'step' mode actually takes effect.
+        self.milestones = list(milestones)
+        self.gamma = gamma
+        super().__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        for i, milestone in enumerate(self.milestones):
+            if self.last_epoch < milestone:
+                return self.base_lr * (self.gamma**i)
+        return self.base_lr * (self.gamma ** len(self.milestones))
+
+class CosineDecay(lr.LRScheduler):
+    """Cosine annealing of the learning rate with an optional linear warmup.
+
+    ``epochs`` and ``warmups`` are given in epochs; with ``decay_unit='step'``
+    both are multiplied by ``step_each_epoch``.
+    """
+    def __init__(self,
+                 learning_rate,
+                 step_each_epoch,
+                 epochs,
+                 decay_unit='epoch',
+                 warmups=0,
+                 verbose=False,
+                 last_epoch=-1,
+                 **kwargs):
+        assert decay_unit in ['step', 'epoch']
+        self.T_max = epochs if decay_unit == 'epoch' else step_each_epoch * epochs
+        self.warmups = warmups if decay_unit == 'epoch' else step_each_epoch * warmups
+        assert self.warmups < self.T_max
+        self.last_epoch = last_epoch
+        super(CosineDecay, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        progress = (
+            self.last_epoch - self.warmups) / float(self.T_max - self.warmups)
+        progress = min(1.0, max(0.0, progress))
+        # Compute the cosine value first, then scale it during warmup; the warmup
+        # branch used to read its local ``lr`` before assignment (UnboundLocalError).
+        cur_lr = 0.5 * self.base_lr * (1.0 + math.cos(math.pi * progress))
+        if self.warmups:
+            cur_lr = cur_lr * min(1.0, self.last_epoch / self.warmups)
+        return cur_lr
\ No newline at end of file
diff --git a/tasks/ssl/mocov2/builder_moco.py b/tasks/ssl/mocov2/builder_moco.py
new file mode 100644
index 00000000..e0e52326
--- /dev/null
+++ b/tasks/ssl/mocov2/builder_moco.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+
+class MoCo(nn.Layer):
+    """
+    Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
+    https://arxiv.org/abs/1911.05722
+    """
+
+    def __init__(self, base_encoder, dim=256, mlp_dim=4096, T=1.0):
+        """
+        dim: feature dimension (default: 256)
+        mlp_dim: hidden dimension in MLPs (default: 4096)
+        T: softmax temperature (default: 1.0)
+        """
+        super(MoCo, self).__init__()
+
+        self.T = T
+
+        # build encoders (base_encoder is a factory, called with num_classes=mlp_dim)
+        self.base_encoder = base_encoder(num_classes=mlp_dim)
+        self.momentum_encoder = base_encoder(num_classes=mlp_dim)
+
+        self._build_projector_and_predictor_mlps(dim, mlp_dim)
+
+        for param_b, param_m in zip(self.base_encoder.parameters(),
+                                    self.momentum_encoder.parameters()):
+            param_m.copy_(param_b, False)  # initialize
+            param_m.stop_gradient = True  # not update by gradient
+
+    def _build_mlp(self,
+                   num_layers,
+                   input_dim,
+                   mlp_dim,
+                   output_dim,
+                   last_bn=True):  # last_bn: append a BatchNorm1D after the final Linear
+        mlp = []  # layers are collected here and wrapped in nn.Sequential at the end
+        for l in range(num_layers):
+            dim1 = input_dim if l == 0 else mlp_dim
+            dim2 = output_dim if l == num_layers - 1 else mlp_dim
+
+            mlp.append(nn.Linear(dim1, dim2, bias_attr=False))
+
+            if l < num_layers - 1:
+                mlp.append(nn.BatchNorm1D(dim2))
+                mlp.append(nn.ReLU())
+            elif last_bn:
+                # follow SimCLR's design: https://github.com/google-research/simclr/blob/master/model_util.py#L157
+                # for simplicity, we further removed gamma in BN
+                mlp.append(
+                    nn.BatchNorm1D(
+                        dim2, weight_attr=False, bias_attr=False))
+
+        return nn.Sequential(*mlp)
+
+    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
+        pass  # hook: subclasses build the projector heads and self.predictor, which forward() uses
+
+    @paddle.no_grad()
+    def _update_momentum_encoder(self, m):
+        """Momentum update of the momentum encoder"""
+        with paddle.amp.auto_cast(False):
+            for param_b, param_m in zip(self.base_encoder.parameters(),
+                                        self.momentum_encoder.parameters()):
+                paddle.assign((param_m * m + param_b * (1. - m)), param_m)
+
+    def contrastive_loss(self, q, k):
+        # normalize
+        q = nn.functional.normalize(q, axis=1)
+        k = nn.functional.normalize(k, axis=1)
+        # gather all targets
+        k = concat_all_gather(k)
+        # Einstein sum is more intuitive
+        logits = paddle.einsum('nc,mc->nm', q, k) / self.T
+        N = logits.shape[0]  # batch size per GPU
+        labels = (paddle.arange(
+            N, dtype=paddle.int64) + N * paddle.distributed.get_rank())  # offset into the gathered keys
+        return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)
+
+    def forward(self, x1, x2, m):
+        """
+        Input:
+            x1: first views of images
+            x2: second views of images
+            m: moco momentum
+        Output:
+            loss
+        """
+
+        # compute features
+        q1 = self.predictor(self.base_encoder(x1))
+        q2 = self.predictor(self.base_encoder(x2))
+
+        with paddle.no_grad():  # no gradient
+            self._update_momentum_encoder(m)  # update the momentum encoder
+
+            # compute momentum features as targets
+            k1 = self.momentum_encoder(x1)
+            k2 = self.momentum_encoder(x2)
+
+        return self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)  # symmetrized over the two views
+
+
+class MoCo_ResNet(MoCo):  # MoCo whose projector heads replace the encoders' ``fc`` attribute
+    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
+        hidden_dim = self.base_encoder.fc.weight.shape[0]  # presumably the fc input width - TODO confirm weight layout
+        del self.base_encoder.fc, self.momentum_encoder.fc  # remove original fc layer
+
+        # projectors: 2-layer MLPs installed in place of each encoder's fc
+        self.base_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)
+        self.momentum_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)
+
+        # predictor: 2-layer MLP built with last_bn=False
+        self.predictor = self._build_mlp(2, dim, mlp_dim, dim, False)
+
+
+class MoCo_ViT(MoCo):  # MoCo whose projector heads replace the encoders' ``head`` attribute
+    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
+        hidden_dim = self.base_encoder.head.weight.shape[0]  # presumably the head input width - TODO confirm weight layout
+        del self.base_encoder.head, self.momentum_encoder.head  # remove original head layer
+
+        # projectors: 3-layer MLPs installed in place of each encoder's head
+        self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
+        self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim,
+                                                     dim)
+
+        # predictor: 2-layer MLP with the default last_bn=True
+        self.predictor = self._build_mlp(2, dim, mlp_dim, dim)
+
+
+# utils
+@paddle.no_grad()
+def concat_all_gather(tensor):
+    """
+    Gather ``tensor`` from every rank and join the pieces along axis 0.
+    With fewer than two processes the input is returned untouched.
+    """
+    single_process = paddle.distributed.get_world_size() < 2
+    if single_process:
+        return tensor
+
+    gathered = []
+    paddle.distributed.all_gather(gathered, tensor)
+    return paddle.concat(gathered, axis=0)
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
new file mode 100644
index 00000000..b0fc5583
--- /dev/null
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
@@ -0,0 +1,114 @@
+# global configs
+Global:
+  task_type: Classification
+  train_loop: ClassificationTrainingEpochLoop
+  validate_loop: ClassificationEvaluationLoop
+  checkpoint: null
+  pretrained_model: /wangguo/PASSL/pretrained/moco/transformed_mocov2_pt_imagenet2012_resnet50
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  max_num_latest_checkpoint: 0
+  eval_during_train: True
+  eval_interval: 1
+  eval_unit: "epoch"
+  accum_steps: 1
+  epochs: 100
+  print_batch_step: 10
+  use_visualdl: False
+  seed: 2022
+
+# FP16 setting
+FP16:
+  level: O0
+
+DistributedStrategy:
+  data_parallel: True
+
+# model architecture
+Model:
+  name: mocov2_resnet50_linearprobe
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+LRScheduler:
+  name: MultiStepDecay
+  decay_unit: epoch
+  learning_rate: 30.0
+  gamma: 0.1
+  milestones: [60, 80]
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  weight_decay: 0.0
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageFolder
+      root: ./dataset/train
+      transform:
+        - RandomResizedCrop:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - ToTensor:
+        - Normalize:
+            mean: [0.5, 0.5, 0.5]
+            std: [0.5, 0.5, 0.5]
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 32
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 1
+      use_shared_memory: False
+
+  Eval:
+    dataset:
+      name: ImageFolder
+      root: ./dataset/val
+      transform:
+        - ResizeImage:
+            resize_short: 256
+            interpolation: bilinear
+            backend: pil
+        - CenterCropImage:
+            size: 224
+        - ToTensor:
+        - Normalize:
+            mean: [0.5, 0.5, 0.5]
+            std: [0.5, 0.5, 0.5]
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+
+    loader:
+      num_workers: 1
+      use_shared_memory: False
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
+
+Export:
+  export_type: paddle
+  input_shape: [None, 3, 224, 224]
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
new file mode 100644
index 00000000..f97ce0c9
--- /dev/null
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
@@ -0,0 +1,97 @@
+# global configs
+Global:
+  task_type: ContrastiveLearning
+  train_loop: ContrastiveLearningTrainingEpochLoop
+  validate_loop: None
+  checkpoint: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  max_num_latest_checkpoint: 0
+  eval_during_train: False
+  eval_interval: 1
+  eval_unit: "epoch"
+  accum_steps: 1
+  epochs: 200
+  print_batch_step: 10
+  use_visualdl: False
+  seed: 2023
+  
+DistributedStrategy:
+  data_parallel: True
+
+# model architecture
+Model:
+  name: mocov2_resnet50_pretrain
+
+LRScheduler:
+  name: CosineDecay
+  decay_unit: epoch
+  learning_rate: 0.03
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  weight_decay: 0.0001
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageFolder
+      root: ./dataset/train
+      transform:
+        - TwoViewsTransform:
+            base_transform1:
+              - RandomResizedCrop:
+                  size: 224
+                  scale: [0.2, 1.0]
+                  interpolation: bicubic
+              - ColorJitter:
+                  brightness: 0.4
+                  contrast: 0.4
+                  saturation: 0.4
+                  hue: 0.1
+                  p: 0.8
+              - RandomGrayscale:
+                  p: 0.2
+              - GaussianBlur:
+                  sigma: [.1, 2.]
+                  p: 0.5
+              - RandFlipImage:
+                  flip_code: 1
+              - ToTensor:
+              - Normalize:
+                  mean: [0.5, 0.5, 0.5]
+                  std: [0.5, 0.5, 0.5]
+            base_transform2:
+              - RandomResizedCrop:
+                  size: 224
+                  scale: [0.2, 1.0]
+                  interpolation: bicubic
+              - ColorJitter:
+                  brightness: 0.4
+                  contrast: 0.4
+                  saturation: 0.4
+                  hue: 0.1
+                  p: 0.8
+              - RandomGrayscale:
+                  p: 0.2
+              - GaussianBlur:
+                  sigma: [.1, 2.]
+                  p: 0.5
+              - RandFlipImage:
+                  flip_code: 1
+              - ToTensor:
+              - Normalize:
+                  mean: [0.5, 0.5, 0.5]
+                  std: [0.5, 0.5, 0.5]
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 32
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: False
diff --git a/tasks/ssl/mocov2/dataset b/tasks/ssl/mocov2/dataset
new file mode 120000
index 00000000..93a401d4
--- /dev/null
+++ b/tasks/ssl/mocov2/dataset
@@ -0,0 +1 @@
+/wangguo/imagenet/
\ No newline at end of file
diff --git a/tasks/ssl/mocov2/extract_weight.py b/tasks/ssl/mocov2/extract_weight.py
new file mode 100644
index 00000000..5e7e1532
--- /dev/null
+++ b/tasks/ssl/mocov2/extract_weight.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import paddle
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Convert MoCo Pre-Traind Model to DEiT')
+    parser.add_argument(
+        '--input',
+        default='',
+        type=str,
+        metavar='PATH',
+        required=True,
+        help='path to moco pre-trained checkpoint')
+    parser.add_argument(
+        '--output',
+        default='',
+        type=str,
+        metavar='PATH',
+        required=True,
+        help='path to output checkpoint in DEiT format')
+    args = parser.parse_args()
+    print(args)
+
+    # load input; accept a wrapped {'state_dict': ...} checkpoint as well as a bare state dict
+    checkpoint = paddle.load(args.input)
+    state_dict = checkpoint.get('state_dict', checkpoint)
+    for k in list(state_dict.keys()):
+        # retain only base_encoder up to before the embedding layer
+        if k.startswith('base_encoder') and not k.startswith(
+                'base_encoder.head'):
+            # remove prefix
+            state_dict[k[len("base_encoder."):]] = state_dict[k]
+        # delete renamed or unused k
+        del state_dict[k]
+
+    # make output directory if necessary
+    output_dir = os.path.dirname(args.output)
+    if output_dir:  # dirname is '' for a bare file name, and os.makedirs('') would raise
+        os.makedirs(output_dir, exist_ok=True)
+    # save to output
+    paddle.save(state_dict, args.output)
diff --git a/tasks/ssl/mocov2/linearprobe.sh b/tasks/ssl/mocov2/linearprobe.sh
new file mode 100644
index 00000000..f0cb339e
--- /dev/null
+++ b/tasks/ssl/mocov2/linearprobe.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# unset PADDLE_TRAINER_ENDPOINTS
+# export PADDLE_NNODES=1
+# export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538"
+# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ./configs/mocov2_resnet50_lp_in1k_1n8c.yaml
diff --git a/tasks/ssl/mocov2/pretrain.sh b/tasks/ssl/mocov2/pretrain.sh
new file mode 100644
index 00000000..aeac93e3
--- /dev/null
+++ b/tasks/ssl/mocov2/pretrain.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# unset PADDLE_TRAINER_ENDPOINTS
+# export PADDLE_NNODES=1
+# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538"
+# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ./configs/mocov2_resnet50_pt_in1k_1n8c.yaml
\ No newline at end of file

From 930ccabd594898de5dc26c5e7a66509bcc062591 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Mon, 15 May 2023 07:36:02 +0000
Subject: [PATCH 02/18] wrap weight

---
 passl/models/mocov2.py                        |  32 +++-
 passl/models/resnet.py                        |   9 +-
 tasks/ssl/mocov2/builder_moco.py              | 159 ------------------
 .../configs/mocov2_resnet50_lp_in1k_1n8c.yaml |   2 +-
 tasks/ssl/mocov2/extract_weight.py            |  56 ------
 5 files changed, 38 insertions(+), 220 deletions(-)
 delete mode 100644 tasks/ssl/mocov2/builder_moco.py
 delete mode 100644 tasks/ssl/mocov2/extract_weight.py

diff --git a/passl/models/mocov2.py b/passl/models/mocov2.py
index 20ef3adf..af24d096 100644
--- a/passl/models/mocov2.py
+++ b/passl/models/mocov2.py
@@ -125,7 +125,7 @@ def __init__(self,
         self.register_buffer("queue_ptr", paddle.zeros([1], 'int64'))
 
         self.loss_fuc = nn.CrossEntropyLoss()
-
+    
     def save(self, path, local_rank=0, rank=0):
         paddle.save(self.state_dict(), path + ".pdparams")
 
@@ -299,11 +299,33 @@ def _freeze_norm(self, layer):
         if isinstance(layer, (nn.layer.norm._BatchNormBase)):
             layer._use_global_stats = True
 
+    def load_pretrained(self, path, rank=0, finetune=False):
+        """Load weights from ``path + '.pdparams'`` into this layer.
+
+        Keys starting with "0." have that prefix stripped, and tensors are
+        cast to the dtype of the same-named entry of ``self.state_dict()``.
+        ``rank`` and ``finetune`` are accepted but not used here.
+
+        Raises:
+            ValueError: if the ``.pdparams`` file does not exist.
+        """
+        path = path + ".pdparams"
+        if not os.path.exists(path):
+            raise ValueError(
+                "Model pretrain path {} does not exist.".format(path))
+        state = paddle.load(path)
+        for key in list(state.keys()):
+            if key.startswith("0."):
+                state[key[len("0."):]] = state.pop(key)
+        # align checkpoint dtypes with the dtypes this model currently holds
+        for name, param in self.state_dict().items():
+            if name in state and param.dtype != state[name].dtype:
+                state[name] = state[name].cast(param.dtype)
+        self.set_state_dict(state)
 
 def mocov2_resnet50_linearprobe(**kwargs):
     # **kwargs specify numclass
     resnet = MoCoV2LinearProbe(with_pool=True,**kwargs)
-    resnet.fc.load_dict(paddle.load("/wangguo/PASSL/pretrained/moco/class_fc.pdparams"))
     return resnet
 def mocov2_resnet50_pretrain(**kwargs):
     # prepare all layer here
@@ -323,3 +345,9 @@ def mocov2_resnet50_pretrain(**kwargs):
         T=0.2,
         **kwargs)
     return model
+
+if __name__ == "__main__":  # manual smoke test; runs only when this file is executed as a script
+    model = mocov2_resnet50_pretrain()
+    model.save("./mocov2")
+    model_lineprobe = mocov2_resnet50_linearprobe()
+    model_lineprobe.load_pretrained("./mocov2_base_encoder")  # NOTE(review): presumably relies on save() above having produced ./mocov2_base_encoder.pdparams -- confirm
diff --git a/passl/models/resnet.py b/passl/models/resnet.py
index f15f3443..211c2878 100644
--- a/passl/models/resnet.py
+++ b/passl/models/resnet.py
@@ -52,14 +52,19 @@
 class ResNet(PDResNet, Model):
     def __init__(
         self,
-        block,
+        block=None,
         depth=50,
         width=64,
         class_num=1000,
         with_pool=True,
         groups=1,
         zero_init_residual=True,
-    ):
+        ):
+        if block == None:
+            if depth <= 34:
+                block=BasicBlock
+            else:
+                block=BottleneckBlock
         super().__init__(block, depth=depth, width=width, num_classes=class_num, with_pool=with_pool, groups=groups)
 
         # Zero-initialize the last BN in each residual branch,
diff --git a/tasks/ssl/mocov2/builder_moco.py b/tasks/ssl/mocov2/builder_moco.py
deleted file mode 100644
index e0e52326..00000000
--- a/tasks/ssl/mocov2/builder_moco.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-
-
-class MoCo(nn.Layer):
-    """
-    Build a MoCo model with a base encoder, a momentum encoder, and two MLPs
-    https://arxiv.org/abs/1911.05722
-    """
-
-    def __init__(self, base_encoder, dim=256, mlp_dim=4096, T=1.0):
-        """
-        dim: feature dimension (default: 256)
-        mlp_dim: hidden dimension in MLPs (default: 4096)
-        T: softmax temperature (default: 1.0)
-        """
-        super(MoCo, self).__init__()
-
-        self.T = T
-
-        # build encoders
-        self.base_encoder = base_encoder(num_classes=mlp_dim)
-        self.momentum_encoder = base_encoder(num_classes=mlp_dim)
-
-        self._build_projector_and_predictor_mlps(dim, mlp_dim)
-
-        for param_b, param_m in zip(self.base_encoder.parameters(),
-                                    self.momentum_encoder.parameters()):
-            param_m.copy_(param_b, False)  # initialize
-            param_m.stop_gradient = True  # not update by gradient
-
-    def _build_mlp(self,
-                   num_layers,
-                   input_dim,
-                   mlp_dim,
-                   output_dim,
-                   last_bn=True):
-        mlp = []
-        for l in range(num_layers):
-            dim1 = input_dim if l == 0 else mlp_dim
-            dim2 = output_dim if l == num_layers - 1 else mlp_dim
-
-            mlp.append(nn.Linear(dim1, dim2, bias_attr=False))
-
-            if l < num_layers - 1:
-                mlp.append(nn.BatchNorm1D(dim2))
-                mlp.append(nn.ReLU())
-            elif last_bn:
-                # follow SimCLR's design: https://github.com/google-research/simclr/blob/master/model_util.py#L157
-                # for simplicity, we further removed gamma in BN
-                mlp.append(
-                    nn.BatchNorm1D(
-                        dim2, weight_attr=False, bias_attr=False))
-
-        return nn.Sequential(*mlp)
-
-    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
-        pass
-
-    @paddle.no_grad()
-    def _update_momentum_encoder(self, m):
-        """Momentum update of the momentum encoder"""
-        with paddle.amp.auto_cast(False):
-            for param_b, param_m in zip(self.base_encoder.parameters(),
-                                        self.momentum_encoder.parameters()):
-                paddle.assign((param_m * m + param_b * (1. - m)), param_m)
-
-    def contrastive_loss(self, q, k):
-        # normalize
-        q = nn.functional.normalize(q, axis=1)
-        k = nn.functional.normalize(k, axis=1)
-        # gather all targets
-        k = concat_all_gather(k)
-        # Einstein sum is more intuitive
-        logits = paddle.einsum('nc,mc->nm', q, k) / self.T
-        N = logits.shape[0]  # batch size per GPU
-        labels = (paddle.arange(
-            N, dtype=paddle.int64) + N * paddle.distributed.get_rank())
-        return nn.CrossEntropyLoss()(logits, labels) * (2 * self.T)
-
-    def forward(self, x1, x2, m):
-        """
-        Input:
-            x1: first views of images
-            x2: second views of images
-            m: moco momentum
-        Output:
-            loss
-        """
-
-        # compute features
-        q1 = self.predictor(self.base_encoder(x1))
-        q2 = self.predictor(self.base_encoder(x2))
-
-        with paddle.no_grad():  # no gradient
-            self._update_momentum_encoder(m)  # update the momentum encoder
-
-            # compute momentum features as targets
-            k1 = self.momentum_encoder(x1)
-            k2 = self.momentum_encoder(x2)
-
-        return self.contrastive_loss(q1, k2) + self.contrastive_loss(q2, k1)
-
-
-class MoCo_ResNet(MoCo):
-    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
-        hidden_dim = self.base_encoder.fc.weight.shape[0]
-        del self.base_encoder.fc, self.momentum_encoder.fc  # remove original fc layer
-
-        # projectors
-        self.base_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)
-        self.momentum_encoder.fc = self._build_mlp(2, hidden_dim, mlp_dim, dim)
-
-        # predictor
-        self.predictor = self._build_mlp(2, dim, mlp_dim, dim, False)
-
-
-class MoCo_ViT(MoCo):
-    def _build_projector_and_predictor_mlps(self, dim, mlp_dim):
-        hidden_dim = self.base_encoder.head.weight.shape[0]
-        del self.base_encoder.head, self.momentum_encoder.head  # remove original fc layer
-
-        # projectors
-        self.base_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim, dim)
-        self.momentum_encoder.head = self._build_mlp(3, hidden_dim, mlp_dim,
-                                                     dim)
-
-        # predictor
-        self.predictor = self._build_mlp(2, dim, mlp_dim, dim)
-
-
-# utils
-@paddle.no_grad()
-def concat_all_gather(tensor):
-    """
-    Performs all_gather operation on the provided tensors.
-    """
-    if paddle.distributed.get_world_size() < 2:
-        return tensor
-
-    tensors_gather = []
-    paddle.distributed.all_gather(tensors_gather, tensor)
-
-    output = paddle.concat(tensors_gather, axis=0)
-    return output
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
index b0fc5583..33f759f3 100644
--- a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: /wangguo/PASSL/pretrained/moco/transformed_mocov2_pt_imagenet2012_resnet50
+  pretrained_model: ./output/mocov2_resnet50_pretrain/epoch_96_base_encoder
   output_dir: ./output/
   device: gpu
   save_interval: 1
diff --git a/tasks/ssl/mocov2/extract_weight.py b/tasks/ssl/mocov2/extract_weight.py
deleted file mode 100644
index 5e7e1532..00000000
--- a/tasks/ssl/mocov2/extract_weight.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import paddle
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Convert MoCo Pre-Traind Model to DEiT')
-    parser.add_argument(
-        '--input',
-        default='',
-        type=str,
-        metavar='PATH',
-        required=True,
-        help='path to moco pre-trained checkpoint')
-    parser.add_argument(
-        '--output',
-        default='',
-        type=str,
-        metavar='PATH',
-        required=True,
-        help='path to output checkpoint in DEiT format')
-    args = parser.parse_args()
-    print(args)
-
-    # load input
-    checkpoint = paddle.load(args.input)
-    state_dict = checkpoint['state_dict']
-    for k in list(state_dict.keys()):
-        # retain only base_encoder up to before the embedding layer
-        if k.startswith('base_encoder') and not k.startswith(
-                'base_encoder.head'):
-            # remove prefix
-            state_dict[k[len("base_encoder."):]] = state_dict[k]
-        # delete renamed or unused k
-        del state_dict[k]
-
-    # make output directory if necessary
-    output_dir = os.path.dirname(args.output)
-    if not os.path.isdir(output_dir):
-        os.makedirs(output_dir)
-    # save to output
-    paddle.save(state_dict, args.output)

From e1416ceaa9d840e227fe099a1d4d5a1a1b48ba05 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Mon, 15 May 2023 08:37:47 +0000
Subject: [PATCH 03/18] modify contrastive learning

---
 passl/engine/loops/contrastive_learning_loop.py | 3 +++
 passl/engine/loops/loop.py                      | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py
index a772a28d..1406aa30 100644
--- a/passl/engine/loops/contrastive_learning_loop.py
+++ b/passl/engine/loops/contrastive_learning_loop.py
@@ -74,6 +74,9 @@ def train_one_step(self, batch):
         # remove label
         batch = batch[0]
 
+        for idx, value in enumerate(batch):
+            if isinstance(value,paddle.Tensor):
+                batch[idx] = batch[idx].cuda()
         # do forward and backward
         loss_dict = self.forward_backward(batch)
 
diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py
index 8d398feb..9d8c085f 100644
--- a/passl/engine/loops/loop.py
+++ b/passl/engine/loops/loop.py
@@ -285,9 +285,6 @@ def train_one_epoch(self):
                     paddle.to_tensor(batch[0]['label'])
                 ]
 
-            for idx, value in enumerate(batch):
-                if isinstance(value,paddle.Tensor):
-                    batch[idx] = batch[idx].cuda()
 
             self.global_step += 1
 

From f5bdf6e277a3214b4283d55ffe70e299bfc03ad8 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:24:20 +0000
Subject: [PATCH 04/18] add README.md with mocov2

---
 tasks/ssl/mocov2/README.md                    | 102 ++++++++++++++++++
 .../configs/mocov2_resnet50_lp_in1k_1n8c.yaml |  11 +-
 .../configs/mocov2_resnet50_pt_in1k_1n8c.yaml |   2 +-
 3 files changed, 109 insertions(+), 6 deletions(-)
 create mode 100644 tasks/ssl/mocov2/README.md

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
new file mode 100644
index 00000000..45614170
--- /dev/null
+++ b/tasks/ssl/mocov2/README.md
@@ -0,0 +1,102 @@
+# MoCo
+![MoCo](https://user-images.githubusercontent.com/11435359/71603927-0ca98d00-2b14-11ea-9fd8-10d984a2de45.png)
+
+This is a PaddlePaddle implementation of the 
+[MoCov2](https://arxiv.org/abs/2003.04297).
+
+
+## Install Preparation
+
+MoCo requires `PaddlePaddle >= 2.4`.
+```shell
+# git clone https://github.com/PaddlePaddle/PASSL.git
+# cd /path/to/PASSL
+```
+
+All commands are executed in the `PASSL` root directory.
+
+```shell
+# python setup.py install
+```
+
+## Data Preparation
+
+The imagenet 1k dataset needs to be prepared first and will be organized into the following directory structure.
+
+```shell
+ILSVRC2012
+├── train/
+├── xxx
+├── val/
+└── xxx
+```
+
+Then configure the path.
+
+```shell
+mkdir -p dataset
+ln -s /path/to/ILSVRC2012 dataset/ILSVRC2012
+```
+
+## Unsupervised Training
+
+To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu machine, you can run the script: 
+
+### MoCo V2 (Single Node with 8 GPUs)
+```shell
+# sh pretrain.sh
+```
+
+The differences between MoCo v2 and MoCo v1 are as follows:
+* MoCo v2 has a projector
+* Data augmentation
+* Softmax temperature
+* Learning rate scheduler
+
+## Linear Classification
+
+When the unsupervised pre-training is complete, or directly download the provided pre-training checkpoint, you can use the following script to train a supervised linear classifier.
+
+#### Linear Classification Training (Single Node with 8 GPUs)
+
+```shell
+# sh linearprobe.sh
+```
+
+### MoCo v2
+
+#### [Optional] Download checkpoint & Modify yaml  configure
+```shell
+mkdir -p pretrained/moco/
+wget -O ./pretrained/moco/mocov2_pt_imagenet2012_resnet50.pdparams https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams
+```
+
+#### Linear Classification Training (Single Node with 8 GPUs)
+
+```shell
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ./configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+```
+
+## Models
+
+| Model   | Phase                 | Epochs | Top1 Acc | Checkpoint                                                   | Log                                                          |
+| ------- | --------------------- | ------ | -------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| MoCo v2 | Unsupervised Training | 200    | -        | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_pt_imagenet2012_resnet50.log) |
+| MoCo v2 | Linear Classification | 100    | 0.676595 | [download](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.pdparams) | [log](https://paddlefleetx.bj.bcebos.com/model/vision/moco/mocov2_lincls_imagenet2012_resnet50.log) |
+
+
+## Citations
+
+```
+@Article{chen2020mocov2,
+  author  = {Xinlei Chen and Haoqi Fan and Ross Girshick and Kaiming He},
+  title   = {Improved Baselines with Momentum Contrastive Learning},
+  journal = {arXiv preprint arXiv:2003.04297},
+  year    = {2020},
+}
+```
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
index 33f759f3..b60e5853 100644
--- a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: ./output/mocov2_resnet50_pretrain/epoch_96_base_encoder
+  pretrained_model: ./path/to/pretrain
   output_dir: ./output/
   device: gpu
   save_interval: 1
@@ -66,14 +66,15 @@ DataLoader:
         - Normalize:
             mean: [0.5, 0.5, 0.5]
             std: [0.5, 0.5, 0.5]
+            
     sampler:
       name: DistributedBatchSampler
       batch_size: 32
       drop_last: True
       shuffle: True
     loader:
-      num_workers: 1
-      use_shared_memory: False
+      num_workers: 8
+      use_shared_memory: True
 
   Eval:
     dataset:
@@ -98,8 +99,8 @@ DataLoader:
       shuffle: False
 
     loader:
-      num_workers: 1
-      use_shared_memory: False
+      num_workers: 8
+      use_shared_memory: True
 
 Metric:
   Train:
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
index f97ce0c9..3e599005 100644
--- a/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
@@ -94,4 +94,4 @@ DataLoader:
       shuffle: True
     loader:
       num_workers: 8
-      use_shared_memory: False
+      use_shared_memory: True

From 2d303a222263de1d3349863b4276ba8707a8c6f9 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:27:16 +0000
Subject: [PATCH 05/18] modify README

---
 tasks/ssl/mocov2/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
index 45614170..42802d21 100644
--- a/tasks/ssl/mocov2/README.md
+++ b/tasks/ssl/mocov2/README.md
@@ -9,14 +9,14 @@ This is a PaddlePaddle implementation of the
 
 MoCo requires `PaddlePaddle >= 2.4`.
 ```shell
-# git clone https://github.com/PaddlePaddle/PASSL.git
-# cd /path/to/PASSL
+git clone https://github.com/PaddlePaddle/PASSL.git
+cd /path/to/PASSL
 ```
 
 All commands are executed in the `PASSL` root directory.
 
 ```shell
-# python setup.py install
+python setup.py install
 ```
 
 ## Data Preparation
@@ -44,7 +44,7 @@ To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu mac
 
 ### MoCo V2 (Single Node with 8 GPUs)
 ```shell
-# sh pretrain.sh
+sh pretrain.sh
 ```
 
 The differences between MoCo v2 and MoCo v1 are as follows:
@@ -60,7 +60,7 @@ When the unsupervised pre-training is complete, or directly download the provide
 #### Linear Classification Training (Single Node with 8 GPUs)
 
 ```shell
-# sh linearprobe.sh
+sh linearprobe.sh
 ```
 
 ### MoCo v2

From 3d5a1dd1546e40981d59f37770d590b875792069 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:28:54 +0000
Subject: [PATCH 06/18] modify README

---
 tasks/ssl/mocov2/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
index 42802d21..8b918a1c 100644
--- a/tasks/ssl/mocov2/README.md
+++ b/tasks/ssl/mocov2/README.md
@@ -56,6 +56,7 @@ The differences between MoCo v2 and MoCo v1 are as follows:
 ## Linear Classification
 
 When the unsupervised pre-training is complete, or directly download the provided pre-training checkpoint, you can use the following script to train a supervised linear classifier.
+### MoCo v2
 
 #### Linear Classification Training (Single Node with 8 GPUs)
 
@@ -63,7 +64,6 @@ When the unsupervised pre-training is complete, or directly download the provide
 sh linearprobe.sh
 ```
 
-### MoCo v2
 
 #### [Optional] Download checkpoint & Modify yaml  configure
 ```shell

From 179eb20ab2308870c3673e207235eabe3f1daa60 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:33:45 +0000
Subject: [PATCH 07/18] modify README

---
 tasks/ssl/mocov2/README.md | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
index 8b918a1c..77c50e7f 100644
--- a/tasks/ssl/mocov2/README.md
+++ b/tasks/ssl/mocov2/README.md
@@ -11,13 +11,11 @@ MoCo requires `PaddlePaddle >= 2.4`.
 ```shell
 git clone https://github.com/PaddlePaddle/PASSL.git
 cd /path/to/PASSL
+python setup.py install
 ```
 
-All commands are executed in the `PASSL` root directory.
+All commands are executed in the subdirectory of `tasks` directory.
 
-```shell
-python setup.py install
-```
 
 ## Data Preparation
 
@@ -47,12 +45,6 @@ To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu mac
 sh pretrain.sh
 ```
 
-The differences between MoCo v2 and MoCo v1 are as follows:
-* MoCo v2 has a projector
-* Data augmentation
-* Softmax temperature
-* Learning rate scheduler
-
 ## Linear Classification
 
 When the unsupervised pre-training is complete, or directly download the provided pre-training checkpoint, you can use the following script to train a supervised linear classifier.

From 39a14ef7195586ea81bb14e748ae32153d15ee57 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:34:58 +0000
Subject: [PATCH 08/18] delete dataset link

---
 tasks/ssl/mocov2/dataset | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 tasks/ssl/mocov2/dataset

diff --git a/tasks/ssl/mocov2/dataset b/tasks/ssl/mocov2/dataset
deleted file mode 120000
index 93a401d4..00000000
--- a/tasks/ssl/mocov2/dataset
+++ /dev/null
@@ -1 +0,0 @@
-/wangguo/imagenet/
\ No newline at end of file

From 3d819e884b0850a33aec4ced5901a3786fa2a365 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:58:04 +0000
Subject: [PATCH 09/18] modify README,yaml

---
 tasks/ssl/mocov2/README.md                    | 29 ++++++++++++++-----
 .../configs/mocov2_resnet50_lp_in1k_1n8c.yaml |  8 ++---
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
index 77c50e7f..af5bd4b5 100644
--- a/tasks/ssl/mocov2/README.md
+++ b/tasks/ssl/mocov2/README.md
@@ -1,4 +1,4 @@
-# MoCo
+# MoCov2
 ![MoCo](https://user-images.githubusercontent.com/11435359/71603927-0ca98d00-2b14-11ea-9fd8-10d984a2de45.png)
 
 This is a PaddlePaddle implementation of the 
@@ -7,14 +7,14 @@ This is a PaddlePaddle implementation of the
 
 ## Install Preparation
 
-MoCo requires `PaddlePaddle >= 2.4`.
+MoCoV2 requires `PaddlePaddle >= 2.4`.
 ```shell
 git clone https://github.com/PaddlePaddle/PASSL.git
 cd /path/to/PASSL
 python setup.py install
 ```
 
-All commands are executed in the subdirectory of `tasks` directory.
+All commands are executed in the `tasks/ssl/mocov2/` directory.
 
 
 ## Data Preparation
@@ -24,9 +24,7 @@ The imagenet 1k dataset needs to be prepared first and will be organized into th
 ```shell
 ILSVRC2012
 ├── train/
-├── xxx
-├── val/
-└── xxx
+└── val/
 ```
 
 Then configure the path.
@@ -42,7 +40,13 @@ To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu mac
 
 ### MoCo V2 (Single Node with 8 GPUs)
 ```shell
-sh pretrain.sh
+export FLAGS_stop_check_timeout=3600
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ./configs/mocov2_resnet50_pt_in1k_1n8c.yaml
 ```
 
 ## Linear Classification
@@ -53,7 +57,12 @@ When the unsupervised pre-training is complete, or directly download the provide
 #### Linear Classification Training (Single Node with 8 GPUs)
 
 ```shell
-sh linearprobe.sh
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ./configs/mocov2_resnet50_lp_in1k_1n8c.yaml
 ```
 
 
@@ -72,7 +81,11 @@ python -m paddle.distributed.launch \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ./configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+    -c ./configs/mocov2_resnet50_lp_in1k_1n8c.yaml \
+    -o Global.pretrained_model=./pretrained/moco/mocov2_pt_imagenet2012_resnet50
+
 ```
+## Other Configurations
+We provide more directly runnable configurations, see [MoCoV2 Configurations](./configs/).
 
 ## Models
 
diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
index b60e5853..c39f6c2a 100644
--- a/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: ./path/to/pretrain
+  pretrained_model: ./output/mocov2_resnet50_pretrain/latest_base_encoder
   output_dir: ./output/
   device: gpu
   save_interval: 1
@@ -56,7 +56,7 @@ DataLoader:
   Train:
     dataset:
       name: ImageFolder
-      root: ./dataset/train
+      root: ./dataset/ILSVRC2012/train
       transform:
         - RandomResizedCrop:
             size: 224
@@ -66,7 +66,7 @@ DataLoader:
         - Normalize:
             mean: [0.5, 0.5, 0.5]
             std: [0.5, 0.5, 0.5]
-            
+
     sampler:
       name: DistributedBatchSampler
       batch_size: 32
@@ -79,7 +79,7 @@ DataLoader:
   Eval:
     dataset:
       name: ImageFolder
-      root: ./dataset/val
+      root: ./dataset/ILSVRC2012/val
       transform:
         - ResizeImage:
             resize_short: 256

From ed74f55dad82c71ffb8dfe3f77692e4a63f0feed Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Tue, 16 May 2023 02:59:33 +0000
Subject: [PATCH 10/18] modify

---
 tasks/ssl/mocov2/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tasks/ssl/mocov2/README.md b/tasks/ssl/mocov2/README.md
index af5bd4b5..a104ca53 100644
--- a/tasks/ssl/mocov2/README.md
+++ b/tasks/ssl/mocov2/README.md
@@ -40,7 +40,6 @@ To do unsupervised pre-training of a ResNet-50 model on ImageNet in an 8-gpu mac
 
 ### MoCo V2 (Single Node with 8 GPUs)
 ```shell
-export FLAGS_stop_check_timeout=3600
 python -m paddle.distributed.launch \
     --nnodes=$PADDLE_NNODES \
     --master=$PADDLE_MASTER \

From 177678cd6fc933e2dff16b025135356affc21525 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Thu, 25 May 2023 08:00:53 +0000
Subject: [PATCH 11/18] moco

---
 tests/CI/case.sh                              | 49 +++++++++++++++++++
 .../mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh | 31 ++++++++++++
 .../mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh | 31 ++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
 create mode 100644 tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh

diff --git a/tests/CI/case.sh b/tests/CI/case.sh
index 7f428b97..dd9e67c9 100644
--- a/tests/CI/case.sh
+++ b/tests/CI/case.sh
@@ -40,6 +40,8 @@ function model_list(){
     mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1
     simsiam_resnet50_pt_in1k_1n8c_dp_fp32
     simsiam_resnet50_lp_in1k_1n8c_dp_fp32
+    mocov2_resnet50_pt_in1k_1n8c_dp_fp32
+    mocov2_resnet50_lp_in1k_1n8c_dp_fp32
 }
 
 ############ case start ############
@@ -387,6 +389,53 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() {
     echo "=========== $FUNCNAME run  end ==========="
 }
 
+function simsiam_resnet50_pt_in1k_1n8c_dp_fp32() {
+      echo "=========== $FUNCNAME run begin ==========="
+    rm -rf log
+    bash ./ssl/simsiam/simsiam_resnet50_pt_in1k_1n8c_dp_fp32.sh
+
+    loss=`cat log/workerlog.0 | grep '50/2502' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
+    mem=`cat log/workerlog.0 | grep '50/2502' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=-0.32798
+    ips_base=1731.37
+    mem_base=10.55
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run  end ==========="
+}
+
+###### MocoV2 ######
+
+function mocov2_resnet50_lp_in1k_1n8c_dp_fp32() {
+      echo "=========== $FUNCNAME run begin ==========="
+    rm -rf log
+    bash ./ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c.sh
+
+    loss=`cat log/workerlog.0 | grep '50/313' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
+    mem=`cat log/workerlog.0 | grep '50/313' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=6.89298
+    ips_base=6285.21
+    mem_base=5.38
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run  end ==========="
+}
+
+function mocov2_resnet50_pt_in1k_1n8c_dp_fp32() {
+      echo "=========== $FUNCNAME run begin ==========="
+    rm -rf log
+    bash ./ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c.sh
+
+    loss=`cat log/workerlog.0 | grep '50/2502' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
+    mem=`cat log/workerlog.0 | grep '50/2502' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=-0.32798
+    ips_base=1731.37
+    mem_base=10.55
+    check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
+    echo "=========== $FUNCNAME run  end ==========="
+}
+
 function check_result() {
     if [ $? -ne 0 ];then
       echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log
diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
new file mode 100644
index 00000000..04e3c83b
--- /dev/null
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# unset PADDLE_TRAINER_ENDPOINTS
+# export PADDLE_NNODES=1
+# export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538"
+# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=50 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o DataLoader.Train.sampler.batch_size=64
diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh
new file mode 100644
index 00000000..381adf0c
--- /dev/null
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# unset PADDLE_TRAINER_ENDPOINTS
+# export PADDLE_NNODES=1
+# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538"
+# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=50 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o DataLoader.Train.sampler.batch_size=64
\ No newline at end of file

From 130b4b92a898279df976c4d80024fe34677c24c0 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 08:38:55 +0000
Subject: [PATCH 12/18] Update MoCo v2 CI baselines and match log step 50/5004

---
 tests/CI/case.sh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/CI/case.sh b/tests/CI/case.sh
index dd9e67c9..e082ce8b 100644
--- a/tests/CI/case.sh
+++ b/tests/CI/case.sh
@@ -411,12 +411,12 @@ function mocov2_resnet50_lp_in1k_1n8c_dp_fp32() {
     rm -rf log
     bash ./ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c.sh
 
-    loss=`cat log/workerlog.0 | grep '50/313' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    loss=`cat log/workerlog.0 | grep '50/5004' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
-    mem=`cat log/workerlog.0 | grep '50/313' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
-    loss_base=6.89298
-    ips_base=6285.21
-    mem_base=5.38
+    mem=`cat log/workerlog.0 | grep '50/5004' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=4.69785
+    ips_base=6670.45070
+    mem_base=0.81
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
 }
@@ -426,12 +426,12 @@ function mocov2_resnet50_pt_in1k_1n8c_dp_fp32() {
     rm -rf log
     bash ./ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c.sh
 
-    loss=`cat log/workerlog.0 | grep '50/2502' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    loss=`cat log/workerlog.0 | grep '50/5004' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
-    mem=`cat log/workerlog.0 | grep '50/2502' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
-    loss_base=-0.32798
-    ips_base=1731.37
-    mem_base=10.55
+    mem=`cat log/workerlog.0 | grep '50/5004' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=9.30424
+    ips_base=2369.80220
+    mem_base=3.38 
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
 }

From c88f2891ce3a7b26072c2bf156d6e9695bab784e Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 09:32:36 +0000
Subject: [PATCH 13/18] Point MoCo v2 pretrain config at dataset/ILSVRC2012/train

---
 tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
index 3e599005..27befd50 100644
--- a/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
+++ b/tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
@@ -40,7 +40,7 @@ DataLoader:
   Train:
     dataset:
       name: ImageFolder
-      root: ./dataset/train
+      root: ./dataset/ILSVRC2012/train
       transform:
         - TwoViewsTransform:
             base_transform1:

From a158dc9042e5f064b0783248449611ed8536b89f Mon Sep 17 00:00:00 2001
From: MangoFF <939117440@qq.com>
Date: Fri, 26 May 2023 17:44:11 +0800
Subject: [PATCH 14/18] Update MoCo v2 pretrain loss and ips baselines in case.sh

---
 tests/CI/case.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/CI/case.sh b/tests/CI/case.sh
index e082ce8b..d9eb52ec 100644
--- a/tests/CI/case.sh
+++ b/tests/CI/case.sh
@@ -429,8 +429,8 @@ function mocov2_resnet50_pt_in1k_1n8c_dp_fp32() {
     loss=`cat log/workerlog.0 | grep '50/5004' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
     mem=`cat log/workerlog.0 | grep '50/5004' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
-    loss_base=9.30424
-    ips_base=2369.80220
+    loss_base=9.33314
+    ips_base=2076.1308
     mem_base=3.38 
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="

From 9bf4c23eb1174fab401dde9179794e2a782ba136 Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 10:39:21 +0000
Subject: [PATCH 15/18] Pass pretrained MoCo v2 encoder to linear-probe CI script

---
 tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
index 04e3c83b..6fa11b76 100644
--- a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
@@ -29,3 +29,5 @@ python -m paddle.distributed.launch \
     -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
     -o Global.flags.FLAGS_cudnn_deterministic=1 \
     -o DataLoader.Train.sampler.batch_size=64
+    -o Global.pretrained_model=./pretrained/mocov2/mocov2_latest_base_encoder
+

From 9f88fa44de4f8f080e48e1fb8b072fa69bbde54a Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 10:51:12 +0000
Subject: [PATCH 16/18] Add missing line continuation before pretrained_model override

---
 tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
index 6fa11b76..564e98d5 100644
--- a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
@@ -28,6 +28,6 @@ python -m paddle.distributed.launch \
     -o Global.max_train_step=50 \
     -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
     -o Global.flags.FLAGS_cudnn_deterministic=1 \
-    -o DataLoader.Train.sampler.batch_size=64
+    -o DataLoader.Train.sampler.batch_size=64 \
     -o Global.pretrained_model=./pretrained/mocov2/mocov2_latest_base_encoder
 

From b49debb7d51a732ee640440080e6e01307edcf0d Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 10:54:34 +0000
Subject: [PATCH 17/18] Add missing line continuation after -c config in MoCo v2 CI scripts

---
 tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh | 2 +-
 tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
index 564e98d5..f17b52e4 100644
--- a/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c_dp.sh
@@ -23,7 +23,7 @@ python -m paddle.distributed.launch \
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml
+    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_lp_in1k_1n8c.yaml \
     -o Global.print_batch_step=1 \
     -o Global.max_train_step=50 \
     -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
diff --git a/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh b/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh
index 381adf0c..1e7d0be0 100644
--- a/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh
+++ b/tests/CI/ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c_dp.sh
@@ -23,7 +23,7 @@ python -m paddle.distributed.launch \
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml
+    -c ../../tasks/ssl/mocov2/configs/mocov2_resnet50_pt_in1k_1n8c.yaml \
     -o Global.print_batch_step=1 \
     -o Global.max_train_step=50 \
     -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \

From 594c2bff69433163e0f7ec937c9e45c8d0765ccd Mon Sep 17 00:00:00 2001
From: wangguo02 <939117440@qq.com>
Date: Fri, 26 May 2023 11:02:10 +0000
Subject: [PATCH 18/18] Update MoCo v2 CI baselines and match log step 49/2502

---
 tests/CI/case.sh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/CI/case.sh b/tests/CI/case.sh
index d9eb52ec..fc2e27dd 100644
--- a/tests/CI/case.sh
+++ b/tests/CI/case.sh
@@ -411,12 +411,12 @@ function mocov2_resnet50_lp_in1k_1n8c_dp_fp32() {
     rm -rf log
     bash ./ssl/mocov2/mocov2_resnet50_lp_in1k_1n8c.sh
 
-    loss=`cat log/workerlog.0 | grep '50/5004' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    loss=`cat log/workerlog.0 | grep '49/2502' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
-    mem=`cat log/workerlog.0 | grep '50/5004' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
-    loss_base=4.69785
-    ips_base=6670.45070
-    mem_base=0.81
+    mem=`cat log/workerlog.0 | grep '49/2502' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=4.12551
+    ips_base=6449.01604
+    mem_base=0.77
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
 }
@@ -426,12 +426,12 @@ function mocov2_resnet50_pt_in1k_1n8c_dp_fp32() {
     rm -rf log
     bash ./ssl/mocov2/mocov2_resnet50_pt_in1k_1n8c.sh
 
-    loss=`cat log/workerlog.0 | grep '50/5004' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
+    loss=`cat log/workerlog.0 | grep '49/2502' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'`
     ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'`
-    mem=`cat log/workerlog.0 | grep '50/5004' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
-    loss_base=9.33314
-    ips_base=2076.1308
-    mem_base=3.38 
+    mem=`cat log/workerlog.0 | grep '49/2502' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'`
+    loss_base=10.05231
+    ips_base=2045.23616
+    mem_base=6.17 
     check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem}
     echo "=========== $FUNCNAME run  end ==========="
 }