-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconstruct_user_data.py
102 lines (87 loc) · 2.78 KB
/
construct_user_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import gzip
import numpy as np
import json
import tqdm
import random
import collections
import time
# Seed Python's `random` module with a fixed value at import time so that any
# later use of `random` is repeatable from run to run.
# NOTE(review): nothing in the visible part of this file draws from `random`.
random.seed(2020)
def read_user_list(path):
    """Load the user id mapping stored in a whitespace-separated text file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on whitespace; the first field is used as
    the original id and the second field as the remapped id.

    Args:
        path: Path of the user list file.

    Returns:
        A tuple ``(user_dict, max_user)``. ``user_dict`` maps
        org_id (str) to remap_id (str). ``max_user`` is the largest remap_id
        converted to ``int``, or 0 when the file has no data rows.

    Raises:
        IndexError: If a non-blank data line has fewer than two fields.
        ValueError: If a remap_id is not an integer literal.
    """
    user_dict = {}
    max_user = 0
    # The context manager guarantees the handle is closed; iterating the file
    # object avoids loading every line into memory first.
    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if idx == 0:
                continue  # header row
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            user_dict[fields[0]] = fields[1]
            max_user = max(max_user, int(fields[1]))
    return user_dict, max_user
def read_item_list(path):
    """Load the item id mapping stored in a whitespace-separated text file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on whitespace; the first field is used as
    the original id and the second field as the remapped id.

    Args:
        path: Path of the item list file.

    Returns:
        dict mapping org_id (str) to remap_id (str).

    Raises:
        IndexError: If a non-blank data line has fewer than two fields.
    """
    item_dict = {}
    # The context manager guarantees the handle is closed after reading.
    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if idx == 0:
                continue  # header row
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            item_dict[fields[0]] = fields[1]
    return item_dict
def read_attribute_list(path):
    """Load the attribute id mapping stored in a whitespace-separated file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split on whitespace and must have at least
    three fields. The dictionary key is the first field concatenated
    directly (no separator) with the third field; the value is the second
    field.

    Args:
        path: Path of the attribute list file.

    Returns:
        dict mapping ``field0 + field2`` (str) to ``field1`` (str).

    Raises:
        IndexError: If a non-blank data line has fewer than three fields.
    """
    attribute_dict = {}
    # The context manager guarantees the handle is closed after reading.
    with open(path, 'r') as f:
        for idx, line in enumerate(f):
            if idx == 0:
                continue  # header row
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            attribute_dict[fields[0] + fields[2]] = fields[1]
    return attribute_dict
def create_attribute_kg_lfm(path):
    """Write the merged id list and the id-shifted attribute triples.

    Reads ``user_list.txt``, ``attribute_list.txt`` and ``attribute_kg.txt``
    from ``path`` and writes two files next to them:

    * ``attribute_list_final.txt``: a ``org_id remap_id`` header, then every
      user with its remap id unchanged, then every attribute with its remap
      id increased by ``max_user + 1`` (``max_user`` being the largest user
      remap id) -- presumably so attribute ids follow the user ids in one
      shared id space.
    * ``attribute_kg_final.txt``: one line per input triple with the first
      three whitespace-separated fields, the third increased by the same
      offset.

    Args:
        path: Directory prefix that is concatenated directly with the file
            names, so it must already end with a path separator.

    Raises:
        IndexError: If a non-blank triple line has fewer than three fields.
        ValueError: If an id that must be shifted is not an integer literal.
    """
    print('start collect attribute_kg')
    user_dict, max_user = read_user_list(path + 'user_list.txt')
    print(len(user_dict))
    # First id available after the last user id.
    offset = max_user + 1
    attribute_dict = read_attribute_list(path + 'attribute_list.txt')

    with open(path + 'attribute_list_final.txt', 'w') as f:
        f.write('org_id remap_id\n')
        for user, remap_id in user_dict.items():
            f.write(user + ' ' + remap_id + '\n')
        for attribute, remap_id in attribute_dict.items():
            f.write(attribute + ' ' + str(int(remap_id) + offset) + '\n')

    # The input is opened first so a missing source file fails before the
    # output file is created; both handles are closed by the context manager.
    with open(path + 'attribute_kg.txt', 'r') as src, \
            open(path + 'attribute_kg_final.txt', 'w') as dst:
        for line in src:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            shifted = str(int(fields[2]) + offset)
            dst.write(fields[0] + ' ' + fields[1] + ' ' + shifted + '\n')
def construct_user_data(dataset='yelp2018'):
    """Run the attribute preprocessing for one dataset and print its path.

    Args:
        dataset: One of ``'amazon-book'``, ``'last-fm'`` or ``'yelp2018'``.
            Defaults to ``'yelp2018'``, the value that was previously
            hard-coded, so calling with no argument behaves as before.

    Behaviour per dataset:
        * ``'last-fm'`` / ``'yelp2018'``: calls ``create_attribute_kg_lfm``
          on ``./datasets/<dataset>/``.
        * ``'amazon-book'``: only resolves the path; no files are generated.
          NOTE(review): confirm this is intentional.
        * anything else: prints a "dataset not found" message and uses an
          empty path.

    The resolved path is printed in every case. Returns ``None``.
    """
    if dataset == 'amazon-book':
        path = './datasets/amazon-book/'
    elif dataset in ('last-fm', 'yelp2018'):
        path = './datasets/' + dataset + '/'
        create_attribute_kg_lfm(path)
    else:
        path = ''
        # Message text means "dataset not found".
        print('没有找到数据集')
    print(path)
if __name__ == '__main__':
    # Script entry point: run the preprocessing with the default dataset.
    construct_user_data()