-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy path普通版
106 lines (97 loc) · 6.41 KB
/
普通版
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gevent
import gevent.monkey
gevent.monkey.patch_all()
import requests
from lxml import etree
import re
import json
# Brand keywords to search for on Tmall. Only the first entry is active; the
# other candidate brands are parked in the trailing comment on the same line.
word_lists=["相宜本草"]#"悦诗风吟", "碧欧泉","自然堂", "百雀羚", "美宝莲", "伊蒂之屋", "施华蔻", "滋源", "丝蕴", "阿道夫", "吕 ryo", "黛丝恩", "芙丽芳丝", "碧柔", "妮维雅 洁面","力士", "菲诗小铺", "安宝笛", "妮维雅", "曼秀雷敦", "高夫"]

# XPath selecting the title link of every product card on the search page.
_TITLE_LINK_XPATH = '//div[@class="product-iWrap"]//*[@class="productTitle" or contains(@class,"productTitle")]/a[1]'

# Headers sent with every review-list request. They are loop-invariant, so
# they are built once here instead of once per page.
# NOTE(review): "cookie" is a hard-coded logged-in session value committed to
# source. It should be loaded from configuration/environment and rotated; it
# is left byte-identical here so the requests themselves do not change.
# NOTE(review): "authority"/"method"/"scheme" look like HTTP/2 pseudo-headers
# copied from browser dev tools; they are sent as ordinary headers - confirm
# whether they are needed.
_RATE_HEADERS = {
    "authority": "rate.tmall.com",
    "method": "GET",
    "scheme": "https",
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "cna=Y/gvEkEdxn8CAcookie15=VFC%2FuZ9ayeYq2g%3D%3D; uc3=vt3=F8dByRMGABk%2F9ypnSPI%3D&id2=VAFaC64q7QnX&nk2=okKCgOPi&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu6D9F%5Cu6F2A%5Cu5784; lid=%E6%B6%9F%E6%BC%AA%E5%9E%84; _l_g_=Ug%3D%3D; unb=705807873; lgc=%5Cu6D9F%5Cu6F2A%5Cu5784; cookie1=VAYrG4g5P7m343E16lVP62IC5DHsXMLSw0SctPbwLR0%3D; login=true; cookie17=VAFaC64q7QnX; cookie2=157e6a74cd30d191b9059b3db4528c71; _nk_=%5Cu6D9F%5Cu6F2A%5Cu5784; t=9112f19023b58d6a7e797c127de1caa6; sg=%E5%9E%8436; csg=fc322464; _tb_token_=e319fb776eb5e; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; x5sec=7b22726174656d616e616765723b32223a226131663338313366373636376139386230353064393932333633393736383537434c4f436c2b4546454c6134694d75486874477235414561437a63774e5467774e7a67334d7a7378227d; l=aBZu-uDGysaSOrzKzMaY_MNDH707gvfPNK5y1MayQTEhNP3AkeCXjJtoWMsLn_DX7ofhOouHI-a..; isg=BHZ2kDnRNuT9DcIyeWDIPEVfx6y4P7uuVKNbheBffdn2Ixe9SCXd4cbVP7_qjbLp",
    "referer": "https://detail.tmall.com/item.htm?spm=a1z10.3-b-s.w4011-15884659123.103.42684d47xxbPpL&id=567935633478&rn=b030d97bc9c089d8c7ed425dc8ca7805&abbucket=17&skuId=3625875477451",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
}

# Headers for the POST that forwards each review to the collector endpoint.
_POST_HEADERS = {'Content-Type': 'application/json'}

# Crawl flow: for every keyword, fetch the Tmall search page, take up to the
# first three products, request review pages for each product, and POST every
# review as a JSON document to the collector service.
for keyword in word_lists:
    url = "https://list.tmall.com/search_product.htm?q={}".format(keyword)
    print(url)
    html1 = etree.HTML(requests.get(url).text)
    print(len(html1.xpath('//div[@class="product-iWrap"]')))
    print("++++++++++++++++++")

    # Slicing (instead of indexing 0..2) avoids an IndexError when the search
    # returns fewer than three products. The XPath is evaluated once and the
    # href is read from the same anchor the title comes from, so the two can
    # never get out of step.
    for title_link in html1.xpath(_TITLE_LINK_XPATH)[:3]:
        product_url = title_link.get("href")
        print(product_url)

        # Both ids are taken from the link's query string; the patterns
        # require a trailing "&" exactly as before.
        id_match = re.search(r'[&\?]id=(.*?)&', product_url or "", re.S)
        seller_match = re.search(r'user_id=(.*?)&', product_url or "", re.S)
        if id_match is None or seller_match is None:
            # Skip this product instead of aborting the whole crawl.
            print("no product/seller id in link, skipping:", product_url)
            continue
        product_id = id_match.group(1)
        print(product_id)
        sell_id = seller_match.group(1)

        url1 = "https://detail.tmall.com/item.htm?id={}".format(product_id)
        product_title = title_link.xpath("string(.)").strip()
        print(product_title)
        print(product_title, product_id, sell_id, product_url, "\n")

        # Requests review pages 24..33 (currentPage is m + 1).
        # NOTE(review): this hard-coded window looks like a resume point left
        # over from an earlier run - confirm the intended page range.
        for m in range(23, 33):
            # "&currentPage=" had been corrupted to "¤tPage=" (the HTML
            # entity "&curren" rendered as a character), so the page number
            # was never sent as its own query parameter.
            url2 = "https://rate.tmall.com/list_detail_rate.htm?itemId={}&sellerId={}&currentPage={}".format(product_id, sell_id, m + 1)
            print(url2)

            html2 = requests.get(url2, headers=_RATE_HEADERS).text
            # The endpoint answers with JSONP; keep only the JSON argument.
            jsonp_match = re.search(r'jsonp.*?\((.*)\)', html2, re.S)
            if jsonp_match is None:
                print("response is not JSONP, skipping page:", url2)
                continue
            html2 = jsonp_match.group(1)
            try:
                new_html2 = json.loads(html2)
            except ValueError as exc:
                print("invalid JSON in response, skipping page:", url2, exc)
                continue
            print(html2)

            try:
                rate_list = new_html2['rateDetail']['rateList']
            except (KeyError, TypeError):
                print("no rateDetail.rateList in response, skipping page:", url2)
                continue

            for w, rate in enumerate(rate_list, start=1):
                name = rate['displayUserNick']
                rate_date = rate['rateDate']
                content = rate['rateContent']
                # A fresh document per review; key order matches the original
                # so the serialized JSON is unchanged.
                cotents_all = {
                    'url': url1,
                    'id': rate['id'],
                    'context': product_title,
                    'content': content,
                    'tC': rate_date,
                    'user': {"displayName": name},
                }
                print(w, name, rate_date, content)

                # Forwarding stays best-effort, but failures are now reported
                # and only transport errors are caught, so Ctrl-C and
                # programming errors are no longer silently swallowed.
                try:
                    response = requests.post(url="http://servlk/tmall", headers=_POST_HEADERS,
                                             json=cotents_all)
                except requests.RequestException as exc:
                    print("failed to post review {}: {}".format(rate['id'], exc))
                else:
                    print(response.status_code)
                    print("++++++++++++++++")
# if __name__ == "__main__":
# # content_all =["资生堂","红腰子","新透白","PK107","shiseido"]
# word_lists = ["欧珀莱", "UNO", "悠莱", "珊珂", "泊美", "ZA", "水之密语", "丝蓓绮", "玛馨妮", "可悠然", "惠润", "海蓝之谜", "莱珀妮", "希思黎", "娇兰",
# "阿玛尼", "科颜氏", "兰芝", "倩碧", "碧欧泉", "悦木之源", "芭比波朗", "汤姆福特", "YSL", "MAC", "欧莱雅", "雪肌精", "馥蕾诗", "玉兰油",
# "梦妆", "丸美", "Allie 防晒", "Sofina 防晒", "悦诗风吟", "碧欧泉", "自然堂", "百雀羚", "美宝莲", "伊蒂之屋", "施华蔻", "滋源", "丝蕴",
# "阿道夫", "吕 ryo", "黛丝恩", "芙丽芳丝", "碧柔", "妮维雅 洁面", "力士", "菲诗小铺", "安宝笛", "妮维雅", "曼秀雷敦", "高夫"]
#
# # content_all =["PK107","shiseido"]
# # content_all =["Lays Stax","乐事无限","乐事 桶","Calbee 麦片","卡乐比 麦片","上好佳 薯片","薯条"]
# length=len(word_lists)
# xclist = [] # build the pool of coroutines (greenlets)
# for j in range(length):
# xclist.append(gevent.spawn(main,word_lists[j]))
# print(xclist)
#
#
#
# gevent.joinall(xclist)