forked from PulseFocusPlatform/PulseFocusPlatform
416 lines
14 KiB
Python
416 lines
14 KiB
Python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import os
|
|
import os.path as osp
|
|
import shutil
|
|
import requests
|
|
import tqdm
|
|
import hashlib
|
|
import binascii
|
|
import base64
|
|
import tarfile
|
|
import zipfile
|
|
|
|
from .voc_utils import create_list
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
__all__ = [
|
|
'get_weights_path', 'get_dataset_path', 'download_dataset',
|
|
'create_voc_list'
|
|
]
|
|
|
|
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights/static")
|
|
DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset")
|
|
|
|
# dict of {dataset_name: (download_info, sub_dirs)}
|
|
# download info: [(url, md5sum)]
|
|
DATASETS = {
|
|
'coco': ([
|
|
(
|
|
'http://images.cocodataset.org/zips/train2017.zip',
|
|
'cced6f7f71b7629ddf16f17bbcfab6b2', ),
|
|
(
|
|
'http://images.cocodataset.org/zips/val2017.zip',
|
|
'442b8da7639aecaf257c1dceb8ba8c80', ),
|
|
(
|
|
'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
|
|
'f4bbac642086de4f52a3fdda2de5fa2c', ),
|
|
], ["annotations", "train2017", "val2017"]),
|
|
'voc': ([
|
|
(
|
|
'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar',
|
|
'6cd6e144f989b92b3379bac3b3de84fd', ),
|
|
(
|
|
'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar',
|
|
'c52e279531787c972589f7e41ab4ae64', ),
|
|
(
|
|
'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',
|
|
'b6e924de25625d8de591ea690078ad9f', ),
|
|
], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]),
|
|
'wider_face': ([
|
|
(
|
|
'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip',
|
|
'3fedf70df600953d25982bcd13d91ba2', ),
|
|
(
|
|
'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip',
|
|
'dfa7d7e790efa35df3788964cf0bbaea', ),
|
|
(
|
|
'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip',
|
|
'a4a898d6193db4b9ef3260a68bad0dc7', ),
|
|
], ["WIDER_train", "WIDER_val", "wider_face_split"]),
|
|
'fruit': ([(
|
|
'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar',
|
|
'baa8806617a54ccf3685fa7153388ae6', ), ],
|
|
['Annotations', 'JPEGImages']),
|
|
'roadsign_voc': ([(
|
|
'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar',
|
|
'8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']),
|
|
'roadsign_coco': ([(
|
|
'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar',
|
|
'49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']),
|
|
'objects365': (),
|
|
}
|
|
|
|
DOWNLOAD_RETRY_LIMIT = 3
|
|
|
|
|
|
def get_weights_path(url):
|
|
"""Get weights path from WEIGHT_HOME, if not exists,
|
|
download it from url.
|
|
"""
|
|
path, _ = get_path(url, WEIGHTS_HOME)
|
|
return path
|
|
|
|
|
|
def get_dataset_path(path, annotation, image_dir):
|
|
"""
|
|
If path exists, return path.
|
|
Otherwise, get dataset path from DATASET_HOME, if not exists,
|
|
download it.
|
|
"""
|
|
if _dataset_exists(path, annotation, image_dir):
|
|
return path
|
|
|
|
logger.info("Dataset {} is not valid for reason above, try searching {} or "
|
|
"downloading dataset...".format(
|
|
osp.realpath(path), DATASET_HOME))
|
|
|
|
data_name = os.path.split(path.strip().lower())[-1]
|
|
for name, dataset in DATASETS.items():
|
|
if data_name == name:
|
|
logger.debug("Parse dataset_dir {} as dataset "
|
|
"{}".format(path, name))
|
|
if name == 'objects365':
|
|
raise NotImplementedError(
|
|
"Dataset {} is not valid for download automatically. "
|
|
"Please apply and download the dataset from "
|
|
"https://www.objects365.org/download.html".format(name))
|
|
data_dir = osp.join(DATASET_HOME, name)
|
|
# For VOC-style datasets, only check subdirs
|
|
if name in ['voc', 'fruit', 'roadsign_voc']:
|
|
exists = True
|
|
for sub_dir in dataset[1]:
|
|
check_dir = osp.join(data_dir, sub_dir)
|
|
if osp.exists(check_dir):
|
|
logger.info("Found {}".format(check_dir))
|
|
else:
|
|
exists = False
|
|
if exists:
|
|
return data_dir
|
|
|
|
# voc exist is checked above, voc is not exist here
|
|
check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc'
|
|
for url, md5sum in dataset[0]:
|
|
get_path(url, data_dir, md5sum, check_exist)
|
|
|
|
# voc should create list after download
|
|
if name == 'voc':
|
|
create_voc_list(data_dir)
|
|
return data_dir
|
|
|
|
# not match any dataset in DATASETS
|
|
raise ValueError(
|
|
"Dataset {} is not valid and cannot parse dataset type "
|
|
"'{}' for automaticly downloading, which only supports "
|
|
"'voc' , 'coco', 'wider_face', 'fruit' and 'roadsign_voc' currently".
|
|
format(path, osp.split(path)[-1]))
|
|
|
|
|
|
def create_voc_list(data_dir, devkit_subdir='VOCdevkit'):
|
|
logger.debug("Create voc file list...")
|
|
devkit_dir = osp.join(data_dir, devkit_subdir)
|
|
year_dirs = [osp.join(devkit_dir, x) for x in os.listdir(devkit_dir)]
|
|
|
|
# NOTE: since using auto download VOC
|
|
# dataset, VOC default label list should be used,
|
|
# do not generate label_list.txt here. For default
|
|
# label, see ../data/source/voc.py
|
|
create_list(year_dirs, data_dir)
|
|
logger.debug("Create voc file list finished")
|
|
|
|
|
|
def map_path(url, root_dir):
|
|
# parse path after download to decompress under root_dir
|
|
fname = osp.split(url)[-1]
|
|
zip_formats = ['.zip', '.tar', '.gz']
|
|
fpath = fname
|
|
for zip_format in zip_formats:
|
|
fpath = fpath.replace(zip_format, '')
|
|
return osp.join(root_dir, fpath)
|
|
|
|
|
|
def get_path(url, root_dir, md5sum=None, check_exist=True):
|
|
""" Download from given url to root_dir.
|
|
if file or directory specified by url is exists under
|
|
root_dir, return the path directly, otherwise download
|
|
from url and decompress it, return the path.
|
|
|
|
url (str): download url
|
|
root_dir (str): root dir for downloading, it should be
|
|
WEIGHTS_HOME or DATASET_HOME
|
|
md5sum (str): md5 sum of download package
|
|
"""
|
|
# parse path after download to decompress under root_dir
|
|
fullpath = map_path(url, root_dir)
|
|
|
|
# For same zip file, decompressed directory name different
|
|
# from zip file name, rename by following map
|
|
decompress_name_map = {
|
|
"VOCtrainval_11-May-2012": "VOCdevkit/VOC2012",
|
|
"VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007",
|
|
"VOCtest_06-Nov-2007": "VOCdevkit/VOC2007",
|
|
"annotations_trainval": "annotations"
|
|
}
|
|
for k, v in decompress_name_map.items():
|
|
if fullpath.find(k) >= 0:
|
|
fullpath = osp.join(osp.split(fullpath)[0], v)
|
|
|
|
if osp.exists(fullpath) and check_exist:
|
|
# If fullpath is a directory, it has been decompressed
|
|
# checking MD5 is impossible, so we skip checking when
|
|
# fullpath is a directory here
|
|
if osp.isdir(fullpath) or \
|
|
_md5check_from_req(fullpath,
|
|
requests.get(url, stream=True)):
|
|
logger.debug("Found {}".format(fullpath))
|
|
return fullpath, True
|
|
else:
|
|
if osp.isdir(fullpath):
|
|
shutil.rmtree(fullpath)
|
|
else:
|
|
os.remove(fullpath)
|
|
|
|
fullname = _download(url, root_dir, md5sum)
|
|
|
|
# new weights format whose postfix is 'pdparams',
|
|
# which is not need to decompress
|
|
if osp.splitext(fullname)[-1] != '.pdparams':
|
|
_decompress(fullname)
|
|
|
|
return fullpath, False
|
|
|
|
|
|
def download_dataset(path, dataset=None):
|
|
if dataset not in DATASETS.keys():
|
|
logger.error("Unknown dataset {}, it should be "
|
|
"{}".format(dataset, DATASETS.keys()))
|
|
return
|
|
dataset_info = DATASETS[dataset][0]
|
|
for info in dataset_info:
|
|
get_path(info[0], path, info[1], False)
|
|
logger.debug("Download dataset {} finished.".format(dataset))
|
|
|
|
|
|
def _dataset_exists(path, annotation, image_dir):
|
|
"""
|
|
Check if user define dataset exists
|
|
"""
|
|
if not osp.exists(path):
|
|
logger.debug("Config dataset_dir {} is not exits, "
|
|
"dataset config is not valid".format(path))
|
|
return False
|
|
|
|
if annotation:
|
|
annotation_path = osp.join(path, annotation)
|
|
if not osp.exists(annotation_path):
|
|
logger.error("Config dataset_dir {} is not exits!".format(path))
|
|
|
|
if not osp.isfile(annotation_path):
|
|
logger.warning("Config annotation {} is not a "
|
|
"file, dataset config is not "
|
|
"valid".format(annotation_path))
|
|
return False
|
|
if image_dir:
|
|
image_path = osp.join(path, image_dir)
|
|
if not osp.exists(image_path):
|
|
logger.warning("Config dataset_dir {} is not exits!".format(path))
|
|
|
|
if not osp.isdir(image_path):
|
|
logger.warning("Config image_dir {} is not a "
|
|
"directory, dataset config is not "
|
|
"valid".format(image_path))
|
|
return False
|
|
return True
|
|
|
|
|
|
def _download(url, path, md5sum=None):
|
|
"""
|
|
Download from url, save to path.
|
|
|
|
url (str): download url
|
|
path (str): download to given path
|
|
"""
|
|
if not osp.exists(path):
|
|
os.makedirs(path)
|
|
|
|
fname = osp.split(url)[-1]
|
|
fullname = osp.join(path, fname)
|
|
retry_cnt = 0
|
|
|
|
while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
|
|
if retry_cnt < DOWNLOAD_RETRY_LIMIT:
|
|
retry_cnt += 1
|
|
else:
|
|
raise RuntimeError("Download from {} failed. "
|
|
"Retry limit reached".format(url))
|
|
|
|
logger.info("Downloading {} from {}".format(fname, url))
|
|
|
|
req = requests.get(url, stream=True)
|
|
if req.status_code != 200:
|
|
raise RuntimeError("Downloading from {} failed with code "
|
|
"{}!".format(url, req.status_code))
|
|
|
|
# For protecting download interupted, download to
|
|
# tmp_fullname firstly, move tmp_fullname to fullname
|
|
# after download finished
|
|
tmp_fullname = fullname + "_tmp"
|
|
total_size = req.headers.get('content-length')
|
|
with open(tmp_fullname, 'wb') as f:
|
|
if total_size:
|
|
for chunk in tqdm.tqdm(
|
|
req.iter_content(chunk_size=1024),
|
|
total=(int(total_size) + 1023) // 1024,
|
|
unit='KB'):
|
|
f.write(chunk)
|
|
else:
|
|
for chunk in req.iter_content(chunk_size=1024):
|
|
if chunk:
|
|
f.write(chunk)
|
|
|
|
# check md5 after download in Content-MD5 in req.headers
|
|
if _md5check_from_req(tmp_fullname, req):
|
|
shutil.move(tmp_fullname, fullname)
|
|
return fullname
|
|
else:
|
|
logger.warning(
|
|
"Download from url imcomplete, try downloading again...")
|
|
os.remove(tmp_fullname)
|
|
continue
|
|
|
|
|
|
def _md5check_from_req(weights_path, req):
|
|
# For weights in bcebos URLs, MD5 value is contained
|
|
# in request header as 'content_md5'
|
|
content_md5 = req.headers.get('content-md5')
|
|
if not content_md5 or _md5check(
|
|
weights_path,
|
|
binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode(
|
|
)):
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
|
|
def _md5check(fullname, md5sum=None):
|
|
if md5sum is None:
|
|
return True
|
|
|
|
logger.debug("File {} md5 checking...".format(fullname))
|
|
md5 = hashlib.md5()
|
|
with open(fullname, 'rb') as f:
|
|
for chunk in iter(lambda: f.read(4096), b""):
|
|
md5.update(chunk)
|
|
calc_md5sum = md5.hexdigest()
|
|
|
|
if calc_md5sum != md5sum:
|
|
logger.warning("File {} md5 check failed, {}(calc) != "
|
|
"{}(base)".format(fullname, calc_md5sum, md5sum))
|
|
return False
|
|
return True
|
|
|
|
|
|
def _decompress(fname):
|
|
"""
|
|
Decompress for zip and tar file
|
|
"""
|
|
logger.info("Decompressing {}...".format(fname))
|
|
|
|
# For protecting decompressing interupted,
|
|
# decompress to fpath_tmp directory firstly, if decompress
|
|
# successed, move decompress files to fpath and delete
|
|
# fpath_tmp and remove download compress file.
|
|
fpath = osp.split(fname)[0]
|
|
fpath_tmp = osp.join(fpath, 'tmp')
|
|
if osp.isdir(fpath_tmp):
|
|
shutil.rmtree(fpath_tmp)
|
|
os.makedirs(fpath_tmp)
|
|
|
|
if fname.find('tar') >= 0:
|
|
with tarfile.open(fname) as tf:
|
|
tf.extractall(path=fpath_tmp)
|
|
elif fname.find('zip') >= 0:
|
|
with zipfile.ZipFile(fname) as zf:
|
|
zf.extractall(path=fpath_tmp)
|
|
else:
|
|
raise TypeError("Unsupport compress file type {}".format(fname))
|
|
|
|
for f in os.listdir(fpath_tmp):
|
|
src_dir = osp.join(fpath_tmp, f)
|
|
dst_dir = osp.join(fpath, f)
|
|
_move_and_merge_tree(src_dir, dst_dir)
|
|
|
|
shutil.rmtree(fpath_tmp)
|
|
os.remove(fname)
|
|
|
|
|
|
def _move_and_merge_tree(src, dst):
|
|
"""
|
|
Move src directory to dst, if dst is already exists,
|
|
merge src to dst
|
|
"""
|
|
if not osp.exists(dst):
|
|
shutil.move(src, dst)
|
|
elif osp.isfile(src):
|
|
shutil.move(src, dst)
|
|
else:
|
|
for fp in os.listdir(src):
|
|
src_fp = osp.join(src, fp)
|
|
dst_fp = osp.join(dst, fp)
|
|
if osp.isdir(src_fp):
|
|
if osp.isdir(dst_fp):
|
|
_move_and_merge_tree(src_fp, dst_fp)
|
|
else:
|
|
shutil.move(src_fp, dst_fp)
|
|
elif osp.isfile(src_fp) and \
|
|
not osp.isfile(dst_fp):
|
|
shutil.move(src_fp, dst_fp)
|