Commit
movie/book feature crawling
JimSunJing committed Jul 17, 2019
1 parent 3e6ddd7 commit 50c4f38
Showing 3 changed files with 550 additions and 8 deletions.
12 changes: 4 additions & 8 deletions README.md
@@ -15,20 +15,16 @@ It is a very simple crawler to back up one's DouBan account, considering
* DouBan Music Back-up
* DouBan Broadcast crawler
* DouBan Diary Back-up
* DouBan Critique Back-up

## Coming Up

* DouBan Critique Back-up
* Movie label feature
* Music label feature
* Book label feature

## Warning
## Coming Up

* The DouBan Read crawler has been switched to requests (see the sketch below).
* DouBan Dou-List Back-up
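
As a rough illustration of the requests-based approach, here is a minimal sketch mirroring what `code/bookv2.py` does: a `requests.Session` with a desktop User-Agent header, parsed with BeautifulSoup (`your_douban_id` is a placeholder):

```python
import requests
from bs4 import BeautifulSoup

# Session with a desktop User-Agent, as in code/bookv2.py
s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'})

# Fetch one page of a user's wish list and count the book items on it
req = s.get('https://book.douban.com/people/your_douban_id/wish?mode=list')
soup = BeautifulSoup(req.text, 'html.parser')
print(req.status_code, len(soup.find_all(class_='item')))
```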


## About

* Just one more week until exams are over. Fighting!
* I'm thinking of building a standalone DouBan book/movie/music recommendation system; if you're interested in building it together, contact me.
* A subway worker who set out to learn machine learning and ended up building a Python crawler (?)
276 changes: 276 additions & 0 deletions code/bookv2.py
@@ -0,0 +1,276 @@
import requests
from bs4 import BeautifulSoup
import re
from time import sleep,perf_counter
from random import uniform,choice
# Pool of desktop User-Agent strings; switch_header() swaps in a random one after a blocked request.
user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0",
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
                   "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                   "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
                   ]
headers0 = {'User-Agent': user_agent_list[3]}


def noco(txt):
    # Replace ASCII and fullwidth commas (and newlines) so the text is safe inside a CSV field
    return txt.replace(',','、').replace(',','、').replace('\n',' ')

def timebar(scale,start,p):
    # Print a one-line progress bar: p is the fraction done, start is the perf_counter() start time
    a='※'*round(p*scale)
    b='.'*(scale-round(p*scale))
    dur=(perf_counter()-start)/60
    print("\r{:^3.0f}%[{}->{}] elapsed {:.2f} min".format(p*100,a,b,dur),end=' ')

class Douban_Book:
    def __init__(self,doubanid):
        self.s=requests.Session()
        # Attach the default headers to the session
        self.s.headers.update(headers0)
        self.id=doubanid
        # wish_dict format: {bookid:{title,author,translator,original title,publisher,year,pages,ISBN,rating,rating count}}
        self.wish_dict={}
        # Field labels as they appear on a Douban book page; kept in Chinese so they match the scraped text
        self.Keys=['书名','作者','译者','原作名',\
                   '出版社','出版年','页数','ISBN','评分','评分人数']
        # saw_dict format: the wish_dict fields plus {user rating,short comment,user tags,date marked}
        self.saw_dict={}

    def Wish(self):
        print('\nStarting to crawl the wish list of '+self.id)
        beg=int(input('Enter the first page to crawl (e.g. 1): '))
        end=int(input('Enter the last page (crawling at most 10 pages per run is recommended): '))
        page=beg
        firstpage='https://book.douban.com/people/'+self.id+\
            '/wish?sort=time&start='+str((beg-1)*30)+'&filter=all&mode=list&tags_sort=count'
        req=self.s.get(firstpage)
        print(f'Page {page}',req.status_code)
        soup=BeautifulSoup(req.text,'html.parser')
        # Get each book's name and id on the first page
        wish=soup.find_all(class_='item')
        for Item in wish:
            name=Item(href=re.compile('subject'))[0].get_text(strip=True)
            bid=Item.find(href=re.compile('subject')).get('href').split('/')[-2]
            self.wish_dict[bid]={'书名':name,'作者':'','译者':'','原作名':'','出版社':'',\
                                 '出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
        # Follow the "next" link until the requested end page or the last page
        while 1:
            sleep(uniform(1.5,4))
            if page==end:
                break
            try:
                NextPage='https://book.douban.com'+soup.find(class_='next').link.get('href')
            except:
                break
            else:
                req=self.s.get(NextPage)
                page+=1
                print(f'Page {page}',req.status_code)
                soup=BeautifulSoup(req.text,'html.parser')
                wish=soup.find_all(class_='item')
                for Item in wish:
                    name=Item(href=re.compile('subject'))[0].get_text(strip=True)
                    bid=Item.find(href=re.compile('subject')).get('href').split('/')[-2]
                    self.wish_dict[bid]={'书名':name,'作者':'','译者':'','原作名':'','出版社':'',\
                                         '出版年':'','页数':'','ISBN':'','评分':'','评分人数':''}
        # Fetch the detail-page features for every book
        count=0
        st=perf_counter()
        total=len(self.wish_dict)
        fail=[]
        for bid in self.wish_dict.keys():
            count+=1
            if count%50==0:
                sleep(10)
            sleep(uniform(1.5,4))
            timebar(30,st,count/total)
            fail.append(self.get_feature(bid,'wish'))
        print('\nRetrying the book pages that failed to open')
        sleep(10)
        for fbid in fail:
            if fbid!=None:
                sleep(2)
                print()
                self.get_feature(fbid,'wish')
        return self.wish_dict


    def get_feature(self,bid,ty):
        # Fill in the detail-page fields for one book; returns the book id on
        # failure (so the caller can retry it later), None on success.
        if ty=='wish':
            dic=self.wish_dict
        elif ty=='saw':
            dic=self.saw_dict
        head='https://book.douban.com/subject/'
        try:
            req2=self.s.get(head+bid)
            if req2.status_code!=requests.codes.ok:
                print('\rFailed to open the book page: '+head+bid)
                self.switch_header()
                return bid
            print(' '+dic[bid]['书名'].center(20,':')+' status:',req2.status_code,end=' ')
            if req2.status_code == requests.codes.ok:
                soup2=BeautifulSoup(req2.text,'html.parser')
                # The info block lists fields as "label:value" lines (fullwidth colon, Chinese labels)
                c=soup2.find(id='info').text.replace('\xa0','').replace('\n ','')
                intro=c.split('\n')
                for i in intro:
                    if ':' in i:
                        i=i.replace(' ','')
                        key,value=i.split(':',1)
                        if key in self.Keys:
                            dic[bid][key]=value
                try:
                    dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ')
                except:
                    dic[bid]['评分']=''
                try:
                    dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ')
                except:
                    dic[bid]['评分人数']='0'
        except:
            print('\rFailed to open the book page: '+head+bid)
            self.switch_header()
            return bid

    def saw_get(self,saw):
        # Extract date marked, user rating, short comment, user tags, title and id
        # from one item of the "read" list.
        date=saw(class_=re.compile('date'))[0].get_text(strip=True)
        try:
            star=saw(class_=re.compile('rat'))[0]['class'][0][6]
        except:
            star=''
        try:
            comment=saw(class_=re.compile('comment'))[0].get_text(strip=True)
        except:
            comment=''
        try:
            owntag_list=saw.find(class_='tags').get_text(strip=True).split(': ',1)[1].split(' ')
            owntag='/'.join(owntag_list)
        except:
            owntag=''
        name=saw.find(href=re.compile('subject')).get_text(strip=True)
        bid=saw.find(href=re.compile('subject')).get('href').split('/')[-2]
        return date,star,comment,owntag,name,bid

    def Saw(self):
        print('\nStarting to crawl the read list of '+self.id)
        beg=int(input('Enter the first page to crawl (e.g. 1): '))
        end=int(input('Enter the last page (crawling at most 10 pages per run is recommended): '))
        page=beg
        # Visit the user's homepage first (warm-up request)
        homepage='https://book.douban.com/people/'+self.id
        tr=self.s.get(homepage)
        Sfirstpage='https://book.douban.com/people/'+self.id+'/collect?&sort=time&start='+str((beg-1)*30)+'&filter=all&mode=list'
        req=self.s.get(Sfirstpage)
        soup=BeautifulSoup(req.text,'html.parser')
        print(f'Page {page}',req.status_code)
        # Get each book's name and id on the first page
        saw=soup.find_all(class_=['item'])
        for i in range(len(saw)):
            date,star,comment,owntag,name,bid=self.saw_get(saw[i])
            self.saw_dict[bid]={'书名':name,'作者':'','译者':'','原作名':'','出版社':'',\
                                '出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
                                '用户评分':star,'短评':comment,'用户标签':owntag,'标记日期':date}
        # Follow the "next" link until the requested end page or the last page
        while 1:
            sleep(uniform(1.5,4))
            if page==end:
                break
            try:
                NextPage='https://book.douban.com'+soup.find(class_='next').link.get('href')
            except:
                break
            else:
                req=self.s.get(NextPage)
                soup=BeautifulSoup(req.text,'html.parser')
                page+=1
                print(f'Page {page}',req.status_code)
                saw=soup.find_all(class_=['item'])
                for i in range(len(saw)):
                    date,star,comment,owntag,name,bid=self.saw_get(saw[i])
                    self.saw_dict[bid]={'书名':name,'作者':'','译者':'','原作名':'','出版社':'',\
                                        '出版年':'','页数':'','ISBN':'','评分':'','评分人数':'',\
                                        '用户评分':star,'短评':comment,'用户标签':owntag,'标记日期':date}
        # Fetch the detail-page features for every book
        count=0
        st=perf_counter()
        total=len(self.saw_dict)
        fail=[]
        for bid in self.saw_dict.keys():
            count+=1
            if count%50==0:
                sleep(10)
            sleep(uniform(1.5,4))
            timebar(30,st,count/total)
            fail.append(self.get_feature(bid,'saw'))
        print('\nRetrying the book pages that failed to open')
        sleep(10)
        for fbid in fail:
            if fbid!=None:
                sleep(2)
                print()
                self.get_feature(fbid,'saw')
        return self.saw_dict


    def save_as_csv(self,choice):
        id=self.id
        if choice in ['a','c']:
            # Save the wish list ("想读")
            wish_dict=self.wish_dict
            fw=open(id+'想读plus.csv','a',encoding='utf-8-sig')
            fw.write('书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数\n')
            for bid in wish_dict.keys():
                fw.write(noco(wish_dict[bid]['书名'])+','+noco(wish_dict[bid]['作者'])+\
                         ','+noco(wish_dict[bid]['译者'])+','+noco(wish_dict[bid]['原作名'])+','+\
                         noco(wish_dict[bid]['出版社'])+','+noco(wish_dict[bid]['出版年'])+','+\
                         wish_dict[bid]['页数']+','+wish_dict[bid]['ISBN']+','+\
                         wish_dict[bid]['评分']+','+wish_dict[bid]['评分人数']+'\n')
            fw.close()
        if choice in ['b','c']:
            # Save the read list ("读过")
            saw_dict=self.saw_dict
            fw2=open(id+'读过plus.csv','a',encoding='utf-8-sig')
            fw2.write('书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,短评,用户标签,标记日期\n')
            for bid in saw_dict.keys():
                fw2.write(noco(saw_dict[bid]['书名'])+','+noco(saw_dict[bid]['作者'])+\
                          ','+noco(saw_dict[bid]['译者'])+','+noco(saw_dict[bid]['原作名'])+','+\
                          noco(saw_dict[bid]['出版社'])+','+noco(saw_dict[bid]['出版年'])+','+\
                          saw_dict[bid]['页数']+','+saw_dict[bid]['ISBN']+','+\
                          saw_dict[bid]['评分']+','+saw_dict[bid]['评分人数']+','+saw_dict[bid]['用户评分']+','+\
                          noco(saw_dict[bid]['短评'])+','+noco(saw_dict[bid]['用户标签'])+\
                          ','+saw_dict[bid]['标记日期']+'\n')
            fw2.close()

    def switch_header(self):
        # Swap in a random User-Agent after a blocked request
        headers0['User-Agent']=choice(user_agent_list)
        self.s.headers.update(headers0)

def main():
    print('Hey, I hear you want to back up your DouBan book records?')
    print('''A few things to know:
    1. This program is a crawler; fetching the feature data for each book generates a lot of page requests, so your IP may be blocked by DouBan for a while afterwards (logging in with your account still works).
    2. A lot of page requests also means a lot of network traffic.
    3. On success, your csv file is saved in the directory containing this exe; do not run the program from inside the zip archive, extract it first.
    4. It may take quite a while.''')
    ans1=input('Confirm that you want to start the backup (yes/no): ')
    if ans1=='yes':
        Douid=input('Enter your DouBan id: ')
        clawer=Douban_Book(doubanid=Douid)
        print('''
Options:
A: wish list
B: read list
C: wish + read''')
        ans2=input('Enter what you want to crawl: ')
        ans2=ans2.lower()
        if ans2=='a':
            clawer.Wish()
        elif ans2=='b':
            clawer.Saw()
        elif ans2=='c':
            clawer.Wish()
            clawer.Saw()
        clawer.save_as_csv(choice=ans2)
    print('\nFeedback: [email protected] | https://github.com/JimSunJing/douban_clawer')


main()
sleep(10)
over=input('Press any key to exit')
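
For reference, a minimal non-interactive driver mirroring what main() does after its prompts (a sketch only; 'your_douban_id' is a placeholder, and Wish() itself still asks for start/end pages via input()):

clawer = Douban_Book(doubanid='your_douban_id')  # 'your_douban_id' is a placeholder
clawer.Wish()                   # crawl the wish list, then fetch per-book features
clawer.save_as_csv(choice='a')  # append the results to <id>想读plus.csv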