Pretrained models won't train on TPU

Why is it that on Colab's TPU, models built with keras.applications or tfhub.KerasLayer fail to load correctly as soon as training starts, making training impossible? The official docs are so convoluted that I can't even tell whether these pretrained models can be used at all :cold_sweat:

Could you share a minimal Colab notebook that reproduces the problem?

If you can, it will make it much easier for other developers in the community to quickly understand your code and the situation you're facing, and it will make it much more likely that your question gets answered.

Teacher, I upgraded to TF 2.3 and models from tfhub or keras.applications still throw errors. I've never used distributed training before, so I don't know how to debug the errors either; I suspect the problem is in how I'm using it.

# TF 2.3
# Dataset: http://qavs3w9z3.bkt.clouddn.com/refuse_classification.zip
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

CKPT_DIR = './ckpt/'
DATASET_DIR = './train/'
CSV_PATH = './train.csv'
VALIDATION_SPLIT = 0.1
LEARNING_RATE = 1e-5
EPOCHS = 100
BATCH_SIZE = 4
NUM_OF_CLASSES = 6
MODEL_IMAGE_SIZE = (224, 224, 3)
PROCESS_IMAGE_SIZE = (224, 224)
CALLBACKS = [
    ModelCheckpoint(filepath=CKPT_DIR + 'model.{epoch:04d}-{val_accuracy:.04f}.h5',
                    monitor='val_accuracy',
                    verbose=1,
                    period=2),
    TensorBoard(log_dir='./logs/{}'.format('train'),
                histogram_freq=1,
                write_graph=True,
                update_freq='epoch'),
    EarlyStopping(monitor='val_accuracy',
                  min_delta=1e-4,
                  patience=5,
                  verbose=1),
]


def set_gpus():
    """Configure physical GPU options (memory growth)."""
    gpus = tf.config.list_physical_devices(device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(device=gpu, enable=True)


def efficientnet(input_shape, num_of_classes):
    """Build the model: EfficientNetB4 backbone plus a small classification head."""

    input_layer = Input(shape=input_shape, name='Input-Layer')
    # Passing input_tensor already wires the backbone to input_layer, so use its
    # output tensor directly instead of calling the returned model a second time.
    backbone = EfficientNetB4(include_top=False,
                              weights='imagenet',
                              input_tensor=input_layer)
    flatten_layer = Flatten(name='Flatten-Layer')(backbone.output)
    dense_layer = Dense(units=128, activation='relu', name='Dense-Layer')(flatten_layer)
    dropout_layer = Dropout(rate=0.5, name='Dropout-Layer')(dense_layer)
    output_layer = Dense(units=num_of_classes, activation='softmax', name='Output-Layer')(dropout_layer)

    model = Model(inputs=input_layer, outputs=output_layer)

    return model


def get_dataframe(csv_path):
    """Load the label CSV into a shuffled DataFrame."""
    dataframe = pd.read_csv(csv_path, index_col=None)
    dataframe['label'] = dataframe['label'].astype(str)
    dataframe = dataframe.sample(frac=1)

    return dataframe


def load_preprocess(image_size, batch_size, dataset_dir, csv_path, validation_split=0.2):
    """Build augmented image batches from the CSV-listed files."""

    datagen = ImageDataGenerator(rotation_range=30,
                                 width_shift_range=0.2,
                                 height_shift_range=0.2,
                                 shear_range=0.2,
                                 zoom_range=0.2,
                                 channel_shift_range=10,
                                 horizontal_flip=True,
                                 fill_mode='nearest')

    df = get_dataframe(csv_path)
    if validation_split > 0:
        train_size = int((1 - validation_split) * df.shape[0])
        train_df = df.iloc[:train_size]
        validation_df = df.iloc[train_size:]

        train_batches = datagen.flow_from_dataframe(dataframe=train_df,
                                                    directory=dataset_dir,
                                                    x_col='filename',
                                                    y_col='label',
                                                    target_size=image_size,
                                                    interpolation='bicubic',
                                                    class_mode='categorical',
                                                    shuffle=True,
                                                    batch_size=batch_size,
                                                    validate_filenames=True)
        validation_batches = datagen.flow_from_dataframe(dataframe=validation_df,
                                                         directory=dataset_dir,
                                                         x_col='filename',
                                                         y_col='label',
                                                         target_size=image_size,
                                                         interpolation='bicubic',
                                                         class_mode='categorical',
                                                         shuffle=True,
                                                         batch_size=batch_size,
                                                         validate_filenames=True)
        return train_batches, validation_batches
    else:
        batches = datagen.flow_from_dataframe(dataframe=df,
                                              directory=dataset_dir,
                                              x_col='filename',
                                              y_col='label',
                                              target_size=image_size,
                                              interpolation='bicubic',
                                              class_mode='categorical',
                                              shuffle=True,
                                              batch_size=batch_size,
                                              validate_filenames=True)

        return batches


if __name__ == '__main__':
    # Configure GPU memory growth (not needed when training on TPU)
    # set_gpus()

    # Set up the TPU: resolve the cluster, connect, initialize, and build the strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

    # Instantiate and compile the model inside the strategy scope
    with strategy.scope():
        if not os.path.exists(CKPT_DIR):
            os.mkdir(CKPT_DIR)
        model = efficientnet(MODEL_IMAGE_SIZE, NUM_OF_CLASSES)
        model.summary()
        # The output layer already applies softmax, so the loss must not expect logits
        model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
                      loss=CategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])

    # Load the data
    train_batches, validation_batches = load_preprocess(PROCESS_IMAGE_SIZE, BATCH_SIZE,
                                                        DATASET_DIR, CSV_PATH, VALIDATION_SPLIT)
    print(train_batches.class_indices)
    print(validation_batches.class_indices)

    try:
        # Train the model and save it (the generator already defines the batch size)
        model.fit(x=train_batches,
                  epochs=EPOCHS,
                  verbose=1,
                  callbacks=CALLBACKS,
                  validation_data=validation_batches)
        model.save('./model.h5')
    except KeyboardInterrupt:
        # Also save a model if training is stopped early
        model.save('./model.h5')

@huan could you take a look when you have a moment?

Below is the GitHub repo for the TPU chapter of the tf.wiki book draft:

I suggest you first get the most basic TPU usage code running smoothly, and then build your further adjustments and modifications on top of that.

The Colab example from the TPU chapter of 《简明的 TensorFlow 2》 (A Concise Handbook of TensorFlow 2) is here: https://colab.research.google.com/github/huan/tensorflow-handbook-tpu/blob/master/tensorflow-handbook-tpu-example.ipynb
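For orientation, a minimal TPU training flow typically looks like the sketch below. This is only an illustration assuming a Colab TPU runtime with TF 2.3; the toy MNIST model and the hyperparameters are placeholders, not the handbook's exact code, so the linked notebook is still the version to actually run first.

# Minimal TPU training sketch (assumes a Colab TPU runtime and TF 2.3).
# The MNIST model and hyperparameters below are placeholders for illustration.
import tensorflow as tf

# Resolve, connect to and initialize the TPU, then build the distribution strategy
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.TPUStrategy(tpu)

# Model creation and compilation must happen inside the strategy scope
with strategy.scope():
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# A simple tf.data pipeline; drop_remainder keeps batch shapes static for the TPU
(x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
dataset = (tf.data.Dataset.from_tensor_slices((x_train, y_train))
           .shuffle(10000)
           .batch(128, drop_remainder=True))

model.fit(dataset, epochs=2)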

Once the example runs successfully, you can adapt it to the functionality you need; add features a little at a time and debug as you go, so that problems surface as early as possible.
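For example, one possible incremental step is to swap in the pretrained backbone while keeping everything else from the working example unchanged, so that any new error can be attributed to the pretrained-weight loading itself; whether the imagenet weights load cleanly under the TPU strategy is exactly what this step checks. A rough sketch, reusing `strategy` from the sketch above and borrowing the input shape, class count, and learning rate from your script as assumptions:

# Incremental-step sketch: reuse `strategy` from the already-working TPU example
# and only replace the toy model with the pretrained backbone. The shapes and
# hyperparameters are assumptions copied from the script above.
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB4

with strategy.scope():
    backbone = EfficientNetB4(include_top=False,
                              weights='imagenet',
                              input_shape=(224, 224, 3),
                              pooling='avg')
    outputs = tf.keras.layers.Dense(6, activation='softmax')(backbone.output)
    model = tf.keras.Model(inputs=backbone.input, outputs=outputs)
    # The head ends in softmax, so the loss must not be told to expect logits
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
model.summary()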

If you hit a problem you can't solve and want help from others, please provide at least a Google Colab notebook link that reproduces the issue.
