-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconversion.py
168 lines (140 loc) · 4.96 KB
/
conversion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.
For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import collections.abc
import csv
import getopt
import sys

import simplejson as json
def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read a line-delimited json file and write it out as a csv file.

    The first csv row is the header made of `column_names`; every following
    row holds the values of one json line, looked up column by column via
    `get_row`.

    Args:
        json_file_path: Path of the input file, one json document per line,
            read as UTF-8.
        csv_file_path: Path of the csv file to create (overwritten if present).
        column_names: Iterable of flattened (dotted) column names.
    """
    # Materialise once: the names are needed for the header and again for
    # every row, which would exhaust a one-shot iterable and could otherwise
    # let header and rows disagree on ordering.
    columns = list(column_names)
    with open(csv_file_path, 'w', newline='') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                # Blank lines (e.g. a trailing newline) are not json documents.
                if not line.strip():
                    continue
                line_contents = json.loads(line)
                # Escape non-ASCII / control characters in text cells so each
                # record stays on one physical csv line and is ASCII-safe.
                row = [
                    val.encode('unicode_escape').decode()
                    if isinstance(val, str) else val
                    for val in get_row(line_contents, columns)
                ]
                try:
                    csv_file.writerow(row)
                except UnicodeEncodeError:
                    # Best effort: report the offending row and keep going.
                    print(row)
                    print("Unicode Error")
def get_superset_of_column_names_from_file(json_file_path):
    """Return the set of all flattened column names found in a json file.

    Each non-blank line of the file is parsed as one json document and
    flattened with `get_column_names`; the union of the resulting keys over
    all lines is returned.

    Args:
        json_file_path: Path of the input file, one json document per line,
            read as UTF-8.

    Returns:
        A set of flattened (dotted) column names.
    """
    column_names = set()
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            # Blank lines (e.g. a trailing newline) are not json documents.
            if not line.strip():
                continue
            # Iterating the flattened dict yields its keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names
def get_column_names(line_contents, parent_key=''):
    """Return a flattened dict mapping dotted key names to leaf values.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        will return: {'a.b': 2, 'a.c': 3}

    The keys of the result are the column names for the eventual csv file.
    Nested mappings that are empty contribute no column.

    Args:
        line_contents: The (possibly nested) dict to flatten.
        parent_key: Dotted prefix of the enclosing keys; empty at top level.

    Returns:
        A dict of flattened column name -> leaf value.
    """
    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        # The ABC lives in collections.abc; the old collections.MutableMapping
        # alias no longer exists on Python 3.10+.
        if isinstance(v, collections.abc.MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
            },
        }
        key = 'a.b'
        will return: 2

    Args:
        d: The (possibly nested) dict to look into.
        key: Dotted path such as 'a.b'; each segment is one nesting level.

    Returns:
        The value at the path, or None when any segment is missing or when
        the path runs into a value that is not a mapping.
    """
    current = d
    for part in key.split('.'):
        # A scalar, string or list cannot be descended into; treating it as
        # "no value" avoids TypeErrors when records disagree on whether a
        # field is nested.
        if not isinstance(current, collections.abc.Mapping) or part not in current:
            return None
        current = current[part]
    return current
def get_row(line_contents, column_names):
    """Build one csv row from a parsed json record.

    For every name in `column_names` (in iteration order) the matching value
    is fetched from `line_contents` with `get_nested_value`; whatever that
    lookup returns (including None) becomes the cell.
    """
    return [
        get_nested_value(line_contents, name)
        for name in column_names
    ]
def main(argv):
    """Convert one json dataset file to csv, driven by command-line options.

    Options:
        -h, --help            Print usage and exit.
        -i, --ifile <file>    The json file to convert. Defaults to the
                              photo dataset path below.

    The csv file is written next to the input, with a trailing '.json'
    replaced by '.csv' (or '.csv' appended when there is no such suffix).

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 2 on an invalid option, or without a status
            after printing the help text.
    """
    usage = 'conversion.py -i <inputfile>'
    # json_file = "yelp_dataset/yelp_academic_dataset_business.json"
    json_file = "yelp_dataset/yelp_academic_dataset_photo.json"
    try:
        opts, _ = getopt.getopt(argv, "hi:", ["help", "ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            json_file = arg

    # Strip only a trailing '.json'; splitting on the first occurrence would
    # truncate paths whose directories happen to contain '.json'.
    suffix = '.json'
    base_path = json_file[:-len(suffix)] if json_file.endswith(suffix) else json_file
    csv_file = '{0}.csv'.format(base_path)

    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)
    print("Finished", json_file)
if __name__ == '__main__':
    # Script entry point: convert a yelp dataset file from json to csv.
    # TODO: Convert all the json files
    cli_arguments = sys.argv[1:]  # drop the program name
    main(cli_arguments)