# -*- coding: utf-8 -*-
import time
import lxml
import requests
from bs4 import BeautifulSoup
# HTTP request headers passed to every requests.get() call in this file
# (get_links and get_info both send them via ``headers=headers``).
# 'User-Agent' is a desktop Chrome identification string; 'Cookie' is a
# hard-coded cookie string.
# NOTE(review): the cookie text has spaces around '=' and '-', which looks
# like formatter damage rather than a real browser cookie -- confirm.
# NOTE(review): presumably these are session cookies that expire; verify
# before relying on them.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
'Cookie': 'gr_user_id = c6f58a39 - ea25 - 4f58 - b448 - 545070192c4e;59a81cc7d8c04307ba183d331c373ef6_gr_session_id = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_sid_with_cs1 = e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26;59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_cs1 = N % 2FA;59a81cc7d8c04307ba183d331c373ef6_gr_session_id_e8e4b66f - 440a - 4ae7 - a76a - fe2dd2b34a26 = true;grwng_uid = 9ec14ad9 - 5ac0 - 4bb1 - 81c1 - bc60d2685710;abtest_ABTest4SearchDate = b;xzuuid = 79426b52;_uab_collina = 154660443606130958890473;TY_SESSION_ID = 907f32df - c060 - 49ca - b945 - 98215cc03475;rule_math = pvzq3r06hi'}
def get_links(url, timeout=10):
    """Fetch one search-result page and process every listing linked from it.

    Downloads ``url`` with the module-level ``headers``, parses it with the
    ``lxml`` parser, selects the anchors matching
    ``#page_list > ul > li > a`` and, for each one, calls ``get_info`` on its
    ``href`` and then prints the href.

    Args:
        url: Address of the search-result page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    for anchor in soup.select('#page_list > ul > li > a'):
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skip it instead
            # of handing None to get_info and crashing the whole crawl.
            continue
        get_info(href)
        print(href)
def judgment_sex(class_name):
    """Translate the CSS class of a gender icon into a gender label.

    Args:
        class_name: Value of the icon element's ``class`` attribute. This may
            be a collection of class names (what ``get_info`` passes from
            ``sex.get("class")``), a single space-separated string, or
            ``None`` when the attribute is missing.

    Returns:
        ``'女'`` when ``member_girl_ico`` is among the classes, ``'男'`` when
        ``member_boy_ico`` is, and ``None`` for anything else.
    """
    if isinstance(class_name, str):
        classes = class_name.split()
    elif isinstance(class_name, (list, tuple, set, frozenset)):
        classes = class_name
    else:
        # None or an unexpected type: there is no class information to read.
        return None

    # Membership rather than exact list equality, so an icon that carries an
    # additional CSS class is still recognised.
    if 'member_girl_ico' in classes:
        return '女'
    if 'member_boy_ico' in classes:
        return '男'
    return None
def get_info(url, timeout=10):
    """Fetch one listing detail page, print its details and return them.

    Downloads ``url`` with the module-level ``headers``, parses it with
    ``html.parser`` and extracts the title, address, price, host avatar URL,
    host name and host gender through CSS selectors. The six result lists are
    combined with ``zip``, so records are produced only up to the length of
    the shortest list; if any selector matches nothing, no record is produced.

    Args:
        url: Address of the listing detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts, one per printed record, with the keys ``'tittle'``,
        ``'address'``, ``'price'``, ``'img'``, ``'name'`` and ``'sex'``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # select() returns a list of tags, so get_text()/get() must be called on
    # each element; calling them on the list itself raises an error.
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h5 > em')
    addresses = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    prices = soup.select('#pricePart > div.day_l > span')
    imgs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h7 > a')
    sexs = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h7 > span')

    records = []
    for title, address, price, img, name, sex in zip(titles, addresses, prices, imgs, names, sexs):
        record = {
            # The key spelling 'tittle' (sic) is kept so the printed output
            # stays exactly as before.
            'tittle': title.get_text().strip(),
            'address': address.get_text().strip(),
            'price': price.get_text().strip(),
            'img': img.get("src"),
            'name': name.get_text().strip(),
            'sex': judgment_sex(sex.get("class")),
        }
        print(record)
        records.append(record)
    return records
if __name__ == '__main__':
    # Crawl search-result pages 1 through 13, one request batch per page.
    page_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'
    for number in range(1, 14):
        get_links(page_url.format(number))
        print("------------------这里是1页分割线-----------------------")
        # Pause between pages so the requests are not sent back to back.
        time.sleep(1)
    # To debug a single listing, call get_info() directly with the URL of
    # its detail page.
# 本文标题 (article title): 爬取小猪短租网信息 (Scraping listing data from the Xiaozhu short-term rental site)
# URL地址 (source URL): http://cxhlcq.com/article/ihiejp.html