第一个爬虫哈哈哈哈哈
代码
import requests
import re
content = requests.get('http://www.cnu.cc/discoveryPage/hot-0').text
pattern = re.compile(r'<a href="(.*?)".*?title">(.*?)</d.*?author">(.*?)</di.*?src="(.*?)"', re.S)
results = re.findall(pattern, content)
print(results)
for result in results:
url, name, author, ads = result
print(url, re.sub('\s', '', name), re.sub('\s', '', author), ads)