-
Notifications
You must be signed in to change notification settings - Fork 0
/
xml2json0.py
94 lines (71 loc) · 2.96 KB
/
xml2json0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""
Create the initial JSON file.
INPUT: /Users/susanaparis/Documents/Belgium/IMAGES_plus_TEXT/DATASETS/dress_attributes/data/images
Note: we use the images directory rather than the xml directory because the images have been
de-duplicated, so every item listed there is guaranteed to have an image, whereas the xml
directory does not know which images are missing.
OUTPUT: data1.json
Notes:
xml files live in
/Users/susanaparis/Documents/Belgium/IMAGES_plus_TEXT/DATASETS/dress_attributes/data/xml
images live in
/Users/susanaparis/Documents/Belgium/IMAGES_plus_TEXT/DATASETS/dress_attributes/data/images
"""
import os
import json
from amazon_xml_item import AmazonXMLItem
def init_dress_dict():
    """Return a fresh, empty dress record.

    A new dictionary (with new ``features`` list and ``other`` dict) is built
    on every call, so records never share mutable containers.

    Returns:
        dict: ``imgid`` is 0, ``features`` is an empty list, ``other`` is an
        empty dict, and every remaining field is the empty string.
    """
    return {
        'imgid': 0,
        'asin': '',
        'folder': '',
        'url': '',
        'title': '',
        'brand': '',
        'color': '',
        'features': [],
        'editorial': '',
        'other': {},
    }
# Build one record per image and dump everything to data1.json.
#
# Category folders are listed from the xml root, but the items inside each
# folder are listed from the matching images folder, so only items that have
# an image are processed. Entries whose name starts with "." are skipped.
r_xml_path = '/Users/susanaparis/Documents/Belgium/IMAGES_plus_TEXT/DATASETS/dress_attributes/data/xml/'
folders = [f for f in os.listdir(r_xml_path) if not f.startswith(".")]
r_img_path = '/Users/susanaparis/Documents/Belgium/IMAGES_plus_TEXT/DATASETS/dress_attributes/data/images/'

data = {}
data['dresses'] = []  # a list of dress dictionaries
data['asin2imgid2folder'] = {}  # asin -> (imgid, folder)
data['folder2asin2imgid'] = {}  # folder -> list of (asin, imgid)

imgid = 0  # running id; equals the record's position in data['dresses']
for folder in folders:
    # TODO: consider having a separate file (csv or txt) with all the
    # images to process, instead of reading this directory
    img_path = os.path.join(r_img_path, folder)
    xml_path = os.path.join(r_xml_path, folder)
    # take the asins from the image folder
    image_files = [f for f in os.listdir(img_path) if not f.startswith(".")]
    for image_file in image_files:
        # The image file name without its extension is used as the asin.
        asin = os.path.splitext(image_file)[0]
        xml_file_name = os.path.join(xml_path, asin + '.xml')
        axml = AmazonXMLItem(xml_file_name)

        # populate dress dictionary with info from the xml
        dress = init_dress_dict()
        dress['imgid'] = imgid
        dress['asin'] = axml.get_asin()
        dress['folder'] = folder
        dress['title'] = axml.get_title()
        dress['url'] = axml.get_img_url()
        dress['brand'] = axml.get_brand()
        dress['features'] = axml.get_all_features()
        dress['editorial'] = axml.get_editorial()
        dress['other'] = axml.get_attributes()

        # Explicit check instead of `assert`, which is stripped under -O.
        if asin != dress['asin']:
            raise ValueError(
                "asin mismatch: image file gives {0!r} but {1} reports {2!r}".format(
                    asin, xml_file_name, dress['asin']))

        data['dresses'].append(dress)
        # Index with the SAME id that was stored in the record; the counter is
        # advanced only after both lookup tables have been updated.
        data['asin2imgid2folder'][asin] = (imgid, folder)
        data['folder2asin2imgid'].setdefault(folder, []).append((asin, imgid))
        imgid += 1

# Text mode: json.dump writes str, which a binary-mode file rejects on Python 3.
with open('data1.json', 'w') as fp:
    json.dump(data, fp, indent=4, sort_keys=True)