按8:2切分数据集:将数据集分成80%的训练集和20%的验证集
import os
import random
import shutil
# Source VOC2007 directories holding the annotation XMLs and their images.
annotations_dir = '/home/dataset/VOC2007/Annotations'
images_dir = '/home/dataset/VOC2007/JPEGImages'
# Destination directories for the training split.
train_annotations_dir = '/home/dataset/VOC2007_new/train/Annotations'
train_images_dir = '/home/dataset/VOC2007_new/train/JPEGImages'
# Destination directories for the validation split.
val_annotations_dir = '/home/dataset/VOC2007_new/val/Annotations'
val_images_dir = '/home/dataset/VOC2007_new/val/JPEGImages'


def _move_pairs(pairs, dst_annotations_dir, dst_images_dir):
    """Move every (xml name, image name) pair out of the source directories."""
    for xml_name, img_name in pairs:
        shutil.move(os.path.join(annotations_dir, xml_name),
                    os.path.join(dst_annotations_dir, xml_name))
        shutil.move(os.path.join(images_dir, img_name),
                    os.path.join(dst_images_dir, img_name))


for out_dir in (train_annotations_dir, train_images_dir,
                val_annotations_dir, val_images_dir):
    os.makedirs(out_dir, exist_ok=True)

xml_files = os.listdir(annotations_dir)
jpg_files = os.listdir(images_dir)

# Index the images by lower-cased name so the match is case-insensitive while
# the real on-disk name is still the one used to build paths. A dict also
# replaces the repeated linear scans of a list.
images_by_key = {name.lower(): name for name in jpg_files}

# Pair each annotation with its image BEFORE splitting, so that the 80/20
# ratio applies to the files that are actually moved.
pairs = []
for xml_file in sorted(xml_files):
    stem, ext = os.path.splitext(xml_file)
    if ext.lower() != '.xml':
        continue  # stray non-annotation file
    # pop() guarantees that an image is handed to at most one annotation.
    img_file = images_by_key.pop(stem.lower() + '.jpg', None)
    if img_file is not None:
        pairs.append((xml_file, img_file))

random.shuffle(pairs)
num_train = int(0.8 * len(pairs))
num_val = len(pairs) - num_train

_move_pairs(pairs[:num_train], train_annotations_dir, train_images_dir)
_move_pairs(pairs[num_train:], val_annotations_dir, val_images_dir)
print("数据集分割完成!")
VOC转COCO格式数据集样例代码
import os
import json
import xml.etree.ElementTree as ET
import glob
# Optional fixed mapping of category name -> category id. When left as None,
# convert() builds the mapping from the XML files with get_categories().
PRE_DEFINE_CATEGORIES = None
# Id given to the first annotation written by convert(); every following
# annotation receives the next integer.
START_BOUNDING_BOX_ID = 2
def get(root, name):
    """Return the list of elements under *root* that match *name*.

    This is a thin wrapper around ``root.findall(name)``; the list is empty
    when nothing matches.
    """
    return root.findall(name)
def get_and_check(root, name, length):
    """Look up *name* under *root* and validate how many matches exist.

    Arguments:
        root -- element to search with ``findall``.
        name -- tag (or path) to look for.
        length -- expected number of matches; a value <= 0 disables the
            count check.
    Returns:
        The single matching element when ``length == 1``, otherwise the
        list of all matches.
    Raises:
        ValueError -- when nothing matches, or when ``length > 0`` and the
            number of matches differs from it.
    """
    matches = root.findall(name)
    found = len(matches)
    if found == 0:
        raise ValueError("Can not find %s in %s." % (name, root.tag))
    if length > 0 and found != length:
        raise ValueError(
            "The size of %s is supposed to be %d, but is %d."
            % (name, length, found)
        )
    return matches[0] if length == 1 else matches
def get_filename_as_int(filename):
    """Return the integer encoded by the base name of *filename*.

    Backslashes are treated as path separators, the directory part and the
    extension are dropped, and the remaining stem is converted with ``int``.

    Arguments:
        filename {str} -- file name or path, e.g. ``"imgs/000123.jpg"``.
    Returns:
        int -- the numeric value of the stem.
    Raises:
        ValueError -- when *filename* is not a string or its stem is not an
            integer.
    """
    stem = filename
    try:
        stem = filename.replace("\\", "/")
        stem = os.path.splitext(os.path.basename(stem))[0]
        return int(stem)
    except (AttributeError, TypeError, ValueError) as err:
        # str() keeps the message building from failing when filename is
        # None, which previously escaped as a TypeError from this handler.
        print("filename error" + str(stem))
        raise ValueError(
            "Filename %s is supposed to be an integer." % (stem,)
        ) from err
def get_categories(xml_files):
    """Generate category name to id mapping from a list of xml files.

    Every ``<object>`` element directly under the root of each file
    contributes the text of its ``<name>`` child. The distinct names are
    sorted and numbered from 0.

    Arguments:
        xml_files {list} -- A list of xml file paths.
    Returns:
        dict -- category name to id mapping.
    """
    classes_names = set()
    for xml_file in xml_files:
        root = ET.parse(xml_file).getroot()
        for member in root.findall("object"):
            # Look the tag up by name instead of assuming it is the first
            # child; some writers emit other tags ahead of <name>.
            name_node = member.find("name")
            if name_node is not None:
                classes_names.add(name_node.text)
    return {name: i for i, name in enumerate(sorted(classes_names))}
def convert(xml_files, json_file):
    """Convert Pascal VOC annotation files into one COCO-style JSON file.

    For every XML file one entry is added to ``images`` and one entry per
    ``<object>`` to ``annotations``. Boxes are written as
    ``[xmin - 1, ymin - 1, width, height]``. Categories come from
    ``PRE_DEFINE_CATEGORIES`` when it is set, otherwise from
    ``get_categories``; names met later that are missing from the mapping
    are appended with the next free id.

    The image id is the integer value of the ``<filename>`` stem. Images
    whose name is not an integer are reported and then given unique ids
    above the largest numeric id, so that they no longer all share one id.

    Arguments:
        xml_files {list} -- paths of the VOC XML files to convert.
        json_file {str} -- path of the JSON file to write.
    Raises:
        ValueError -- when a required tag is missing or duplicated, when a
            file has more than one ``<path>`` tag, or when a bounding box
            has no positive width/height.
    """
    json_dict = {"images": [], "type": "instances", "annotations": [], "categories": []}
    if PRE_DEFINE_CATEGORIES is not None:
        # Copy: unseen categories are added below and must not leak into the
        # module-level constant.
        categories = dict(PRE_DEFINE_CATEGORIES)
    else:
        categories = get_categories(xml_files)
    bnd_id = START_BOUNDING_BOX_ID
    # (image entry, its annotation entries) for images still waiting for an
    # id because their file name is not an integer.
    pending = []
    max_image_id = 0
    for xml_file in xml_files:
        root = ET.parse(xml_file).getroot()
        path_count = len(get(root, "path"))
        if path_count > 1:
            raise ValueError("%d paths found in %s" % (path_count, xml_file))
        # The file name always comes from <filename>; <path> is only checked
        # for duplicates.
        filename = get_and_check(root, "filename", 1).text
        try:
            image_id = get_filename_as_int(filename)
        except (TypeError, ValueError):
            print("error file name is " + xml_file)
            image_id = None
        else:
            max_image_id = max(max_image_id, image_id)
        size = get_and_check(root, "size", 1)
        width = int(get_and_check(size, "width", 1).text)
        height = int(get_and_check(size, "height", 1).text)
        image = {
            "file_name": filename,
            "height": height,
            "width": width,
            "id": image_id,
        }
        json_dict["images"].append(image)
        image_anns = []
        for obj in get(root, "object"):
            category = get_and_check(obj, "name", 1).text
            if category not in categories:
                categories[category] = len(categories)
            category_id = categories[category]
            bndbox = get_and_check(obj, "bndbox", 1)
            xmin = int(get_and_check(bndbox, "xmin", 1).text) - 1
            ymin = int(get_and_check(bndbox, "ymin", 1).text) - 1
            xmax = int(get_and_check(bndbox, "xmax", 1).text)
            ymax = int(get_and_check(bndbox, "ymax", 1).text)
            # Explicit check instead of assert, which is stripped under -O.
            if xmax <= xmin or ymax <= ymin:
                raise ValueError(
                    "Invalid bounding box (%d, %d, %d, %d) in %s"
                    % (xmin, ymin, xmax, ymax, xml_file)
                )
            o_width = xmax - xmin
            o_height = ymax - ymin
            ann = {
                "area": o_width * o_height,
                "iscrowd": 0,
                "image_id": image_id,
                "bbox": [xmin, ymin, o_width, o_height],
                "category_id": category_id,
                "id": bnd_id,
                "ignore": 0,
                "segmentation": [],
            }
            json_dict["annotations"].append(ann)
            image_anns.append(ann)
            bnd_id = bnd_id + 1
        if image_id is None:
            pending.append((image, image_anns))

    # Hand out the fallback ids only now, when the largest numeric id is
    # known, so they cannot collide with an id taken from a file name.
    for image, image_anns in pending:
        max_image_id += 1
        image["id"] = max_image_id
        for ann in image_anns:
            ann["image_id"] = max_image_id
        print("assigned image id %d to %s" % (max_image_id, image["file_name"]))

    for cate, cid in categories.items():
        cat = {"supercategory": "none", "id": cid, "name": cate}
        json_dict["categories"].append(cat)

    with open(json_file, "w") as json_fp:
        json.dump(json_dict, json_fp)
    print('categories:',categories)
if __name__ == "__main__":
    # Convert the validation split's VOC annotations into a COCO json file.
    xml_dir = "/home/dataset/VOC2007_new/val/Annotations"
    json_file = "/home/dataset/coco_bug/annotations/instances_val2017.json"
    # Create the folder json_file lives in; deriving it from json_file keeps
    # the two from drifting apart when the output path is edited.
    os.makedirs(os.path.dirname(json_file), exist_ok=True)
    path_xml = os.path.join(xml_dir, "*.xml")
    # glob gives no ordering guarantee; sort so that annotation ids and
    # category ids are the same on every run.
    xml_files = sorted(glob.glob(path_xml))
    print("Number of xml files: {}".format(len(xml_files)))
    convert(xml_files, json_file)
    print("Success: {}".format(json_file))
COCO数据集转YOLO格式数据集
from pycocotools.coco import COCO
import shutil
import os
def convert(size, box):
    """Turn a ``[x, y, w, h]`` box into a normalized centre-based box.

    Arguments:
        size -- ``(width, height)`` of the image.
        box -- ``[x, y, w, h]`` with ``x, y`` the top-left corner.
    Returns:
        tuple -- ``(x_center, y_center, w, h)``, each divided by the
        matching image dimension.
    """
    inv_width = 1. / size[0]
    inv_height = 1. / size[1]
    center_x = box[0] + box[2] / 2.0
    center_y = box[1] + box[3] / 2.0
    return (
        center_x * inv_width,
        center_y * inv_height,
        box[2] * inv_width,
        box[3] * inv_height,
    )
def get_classes_and_index(path):
    """Read a class list file and return a name -> index mapping.

    Each non-blank line is split on commas (at most twice); the first field
    is stored as the value and the second field as the key, so a line
    ``"0,person"`` yields ``{"person": "0"}``. Both are kept as strings.
    Blank lines are skipped. A non-blank line without a comma still raises
    ``IndexError``.

    Arguments:
        path -- path of the text file to read.
    Returns:
        dict -- second field -> first field for every line.
    """
    D = {}
    # The context manager closes the file even if a line is malformed.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue  # e.g. a trailing empty line at the end of the file
            temp = line.split(',', 2)
            print("temp[0]:" + temp[0] + "\n")
            print("temp[1]:" + temp[1] + "\n")
            D[temp[1]] = temp[0]
    return D
def coco2yolo(dataType):
    """Write YOLO label files and an image list for one COCO split.

    Reads ``./annotations/instances_<dataType>.json`` and the class list
    ``./coco_class.txt``. ``./images/<dataType>`` is created as a symlink to
    the ``<dataType>`` folder unless it already exists, and
    ``./labels/<dataType>`` is recreated empty. For every image, one line
    ``"<class> <x> <y> <w> <h>"`` per annotation whose category name is in
    the class list goes to ``labels/<dataType>/<image stem>.txt``, and the
    image path is appended to ``<dataType>.txt``.

    Arguments:
        dataType -- split name such as ``'train2017'``.
    """
    annFile = './annotations/instances_%s.json' % dataType
    classes = get_classes_and_index('./coco_class.txt')

    os.makedirs('./images', exist_ok=True)
    link_path = './images/%s' % dataType
    # Only create the link when nothing is there yet, so every split gets
    # one and a re-run does not fail. lexists() also sees a dangling link.
    if not os.path.lexists(link_path):
        os.symlink(os.path.abspath(dataType), link_path)

    # Start from an empty label folder on every run.
    labels_dir = './labels/%s' % dataType
    if os.path.exists(labels_dir):
        shutil.rmtree(labels_dir)
    os.makedirs(labels_dir)

    coco = COCO(annFile)
    catIds = coco.getCatIds()
    with open('%s.txt' % dataType, 'w') as list_file:
        for imgId in coco.getImgIds():
            Img = coco.loadImgs(imgId)[0]
            filename = Img['file_name']
            size = [Img['width'], Img['height']]
            annIds = coco.getAnnIds(imgIds=imgId, catIds=catIds, iscrowd=None)

            label_lines = []
            for anns in coco.loadAnns(annIds):
                cat = coco.loadCats(anns['category_id'])[0]['name']
                if cat not in classes:
                    continue
                bb = convert(size, anns['bbox'])
                label_lines.append(
                    str(classes[cat]) + " " + " ".join([str(a) for a in bb]) + '\n'
                )

            if label_lines:
                # splitext copes with any extension length (.jpg, .jpeg, ...).
                stem = os.path.splitext(filename)[0]
                with open('labels/%s/%s.txt' % (dataType, stem), 'a') as out_file:
                    out_file.writelines(label_lines)

            # NOTE(review): written once per image, with or without labels;
            # the original indentation was lost - confirm the intended level.
            list_file.write('./images/%s/%s\n' % (dataType, filename))
if __name__ == '__main__':
    # Export YOLO labels for the training split first, then validation.
    for split_name in ('train2017', 'val2017'):
        coco2yolo(split_name)