# scrape.py

import urllib3
from bs4 import BeautifulSoup
import re, pickle, urllib3
from fractions import Fraction
import threading, sys
from timeit import default_timer as timer

#####################################################################################################################
#####################################################################################################################
##############################					Definitions  						#################################
#####################################################################################################################
#####################################################################################################################

# Define Resource class which will store 3 maps, a name, and 2 anti virus ratings
class Resource():
	def __init__(self):
		self.resources = []
		self.name = str
		self.link = str
		self.trojan = 0
		self.avl = 0   # label depending on threshold
		self.avr = 0.0 # Ratio of malware labels positive

class myThread (threading.Thread):
	def __init__(self, threadID, counter,r,select):
		threading.Thread.__init__(self)
		self.threadID = threadID
		self.counter = counter*r+1
		self.r = r
		self.select = select
	def run(self):
		if self.select == 0:
			Scrape_Name(self.name, self.counter, self.r)
		if self.select == 1:
			Scrape_Resc(self.name, self.counter, self.r)
		
malware_dict = {} 	# Dictionary of link to malware using md5 hash
links = [] 			# List of every link

# Set Arguments to command line or default
try:
	test = int(sys.argv[1])		# command line argument setting whether program executes
except:
	test = 0
try:
	pages = int(sys.argv[2])	# otherwise use commandline arguement 2
except:
	pages = 1

# Disable Unverified HTTPS Request
urllib3.disable_warnings()

#####################################################################################################################
#####################################################################################################################
##############################					Functions    						#################################
#####################################################################################################################
#####################################################################################################################


# Scrape links of resources
def Scrape_Name(threadName, counter, r):
	for w in range (counter,counter+r):
		if w > pages:
			sys.exit(0)
		
		http = urllib3.PoolManager()
		index_url = "https://malwr.com/analysis/?page="+str(w)  # change to whatever your url is
		index_page = http.urlopen('GET',index_url,preload_content=False).read()
		index_soup = BeautifulSoup(index_page,'lxml')
		href_table = index_soup.find('table', attrs={'class':'table table-striped'})

		try: 
			href_rows = href_table.find_all('tr')
		except:
			print "https://malwr.com/analysis/?page="+str(w)
			print "Request timed out."
			continue
		print len(href_rows)
		for row in href_rows:
			md5 = ""
			cols = row.find_all('td')
			mal = Resource()
			for ele in cols:
				a = re.search(r'(?<=href=\").*(?=\"><)',str(ele))
				if a is not None:
				   links.append(a.group())
				if cols.index(ele) == 1:
					md5 = ele.text
				if cols.index(ele) == 4:
					try:
						mal.avr = Fraction(ele.text)
						mal.avl = int(ele.text.split("/")[0])
					except:
						mal.avr = -1
						mal.avl = -1
			malware_dict[md5] = mal 			
			
# Scrape name of resources 
def Scrape_Resc(threadName, counter, r):
	for q in range(counter,counter+r):
		if q > (len(links)-1):
			sys.exit(0)
		http = urllib3.PoolManager()
		url = "https://malwr.com"+links[q]  # change to whatever your url is
		page = http.urlopen('GET',url,preload_content=False).read()

		soup = BeautifulSoup(page,'lxml')

		table = soup.find_all('table', attrs={'class':'table table-striped'})
		table = table[1]
		rows = table.find_all('tr')
		for row in rows:
			cols = row.find_all("td")
			'''if rows.index(row)== 2:
				print row.find('td')'''
			if rows.index(row)== 3:
				md5 = row.find('td').text
				break
		
		malware = malware_dict[md5]
		malware.link = "https://malwr.com"+links[q]
		
		table = soup.findAll('div', {"class":"well mono"}) 

		files = []
		regs = []
		mutex = []
		resources = [files,regs,mutex]

		k = 0
		for i in table:
			for j in i:
				try:
					resources[k].append(j.lstrip())
				except:
					pass
			k+=1

		table = soup.find_all('table', attrs={'class':'table table-striped table-bordered'})
		av_pres = 0
		for i in table:
			title = i.find('tr').find('th')
			if title.text == "Antivirus":
				av_pres = 1
				t_rows = i.find_all('tr')
				virus_strings = []
				vendor = "temp"
				for row in t_rows:
					cols = row.find_all('td')
					if len(cols)>0:
						it = iter(cols)
						vendor = next(it).text
						virus_name = next(it).text
						if ( virus_name == "\nClean\n"):
							pass
						else:
							virus_strings.append(virus_name)
				g = open("Virus_Label/"+md5,"w")
				for j in virus_strings:
					if "troj" in j.lower():
						malware.trojan = 1
						f = open("Virus_Label/Trojan.txt","a").write(md5+"\n")
						break
					g.write(j.lower()+"\n")
				
				try:
					f.close()
					g.close()
				except:
					pass
				break
		if av_pres == 0:
			g = open("Virus_Label/"+md5,"w")
			g.write("NO AV")
			g.close()
		
		try: 
			malware.resources = resources
			malware.name = md5
			pickle.dump( malware, open( "Data/file"+str(q), "wb" ) )
		except:
			pass

#####################################################################################################################
#####################################################################################################################
##############################					Main Execution  						#################################
#####################################################################################################################
#####################################################################################################################
			
			
if test == 1:
	######################################################################################################
										# Multithread scraping links for resources
	threadLock = threading.Lock()
	threads = []
	if pages < 11:
		a = pages
		b = 1
	else:
		a = 10
		b = pages/11 + 1
	for i in range (1,a+1):
		threads.append(myThread(i,(i-1),b,0))
		threads[i-1].start()

	for t in threads:
		t.join()
	print len(links)
	###################################################################################################	
										# Multithread scraping name of resources
	entries = pages * 50
	threadLock = threading.Lock()
	threads = []
	if entries < 11:
		a = entries
		b = 1
	else:
		a = 10
		b = entries/10+1
		
	for i in range (1,a+1):
		threads.append(myThread(i,(i-1),b,1))
		threads[i-1].start()

	for t in threads:
		t.join()
	
	pickle.dump( malware_dict, open( "malware_dict", "wb" ) )
	pickle.dump( links, open( "links", "wb" ) )