import re
import requests
from bs4 import BeautifulSoup
# First, inspect what we're dealing with on the landing page
# (a plain Session object is used instead of a with-block, since the
# session is reused by all the cells below)
session = requests.Session()

# Extract the last page number from the pagination links
response = session.get("https://rentinsingapore.com.sg/rooms-for-rent")
soup = BeautifulSoup(response.content, "html.parser")
# last_page = int(re.search(r"page=(\d+)", soup.select_one("li.pager-last").a["href"]).group(1))
print(soup.prettify())
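# Sketch: derive the page count instead of hard-coding n = 546 further down.
# This assumes the pagination markup matches the commented-out selector above
# ("li.pager-last") and that page links end in "page-<N>" or "page=<N>"; both
# are guesses about the site, so fall back to a manual value if they miss.
last_page = None
pager = soup.select_one("li.pager-last")
if pager is not None and pager.a is not None:
    match = re.search(r"page[-=](\d+)", pager.a.get("href", ""))
    if match:
        last_page = int(match.group(1))
print("detected last page:", last_page)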
# Get all the individual room links
room_links = []
base_url = "https://rentinsingapore.com.sg/rooms-for-rent"
# Request pages 1 through n (546 listing pages at the time of scraping)
n = 546
for i in range(1, n + 1):
    print(i)
    if i == 1:
        # The first page has no /page-N suffix
        response = session.get(base_url)
    else:
        response = session.get(base_url + "/page-%d" % i)
    soup = BeautifulSoup(response.content, "html.parser")
    rooms = soup.find_all('div', class_='room__wide listing-container')
    for room in rooms:
        for link in room.find_all('a'):
            room_links.append(link.get('href'))
print(room_links)
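# Sketch: a listing card can contain more than one anchor per room, so the
# list above may hold duplicates and None values (anchors without an href).
# Deduplicating while preserving order keeps the scrape loop below cheaper.
# This is an optional cleanup step, not part of the original run.
room_links = list(dict.fromkeys(link for link in room_links if link))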
# Go to each of the individual room links and extract the elements of interest
import pandas as pd

base_url = "https://rentinsingapore.com.sg/"
title = []
location_street = []
location_district = []
location_area = []
price = []
room_details = []
desc = []
lat = []
long = []
# Visit every room link collected above
for link in room_links:
    # print(link)
    print(len(desc))  # progress: number of rooms processed so far
    response = session.get(base_url + link)
    soup = BeautifulSoup(response.content, "html.parser")
    # Collect the h1 title
    element = soup.find('div', class_='pull-left')
    try:
        title.append(element.find('h1').string)
    except AttributeError:
        title.append("N/A")
    # Collect the location pieces from the children of the location element
    element = soup.find('div', class_='room-location')
    try:
        location_street.append(element.contents[0])
    except (AttributeError, IndexError):
        location_street.append("N/A")
    try:
        location_district.append(element.contents[2])
    except (AttributeError, IndexError):
        location_district.append("N/A")
    try:
        location_area.append(element.contents[4])
    except (AttributeError, IndexError):
        location_area.append("N/A")
    # Collect the price - only the first child node is needed
    element = soup.find('div', class_='room-price')
    try:
        price.append(element.contents[0])
    except (AttributeError, IndexError):
        price.append('')
    # Collect the details in a list - separate them later
    element = soup.find('div', class_="room-details container-show-details")
    try:
        details = [item.text for item in element.find_all('li')]
        room_details.append(details)
    except AttributeError:
        room_details.append('N/A')
    # Collect the description text
    try:
        descript = soup.find("p", {"class": "room-description container-show-description"}).get_text(strip=True)
        desc.append(descript)
    except AttributeError:
        desc.append('N/A')
    # Collect the latitude and longitude from the hidden map inputs
    try:
        lat.append(soup.find("input", {"class": "js-gmap-lat"})['value'])
    except (AttributeError, TypeError, KeyError):
        lat.append('N/A')
    try:
        long.append(soup.find("input", {"class": "js-gmap-lng"})['value'])
    except (AttributeError, TypeError, KeyError):
        long.append('N/A')
# Build the final frame (location_street was collected above but, as in the
# original run, is not written out)
rooms = pd.DataFrame(
    {'title': title,
     'location_area': location_area,
     'location_district': location_district,
     'price': price,
     'room_details': room_details,
     'desc': desc,
     'lat': lat,
     'long': long
     })
rooms.to_csv("name2.csv", encoding='utf-8')
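# Sketch: the repeated try/except blocks above could be collapsed into one
# helper. "safe" is a hypothetical name introduced here for illustration;
# it is not part of the original script.
def safe(getter, default='N/A'):
    """Run a zero-argument extraction callable, returning default on failure."""
    try:
        return getter()
    except (AttributeError, IndexError, KeyError, TypeError):
        return default

# Example usage against a parsed page:
# title.append(safe(lambda: soup.find('div', class_='pull-left').find('h1').string))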
# Ideally you'd have some kind of try/except (with retries) in case the
# internet connection drops. If you forget, like me, you can also read off the
# length of the lists at the moment of the crash and iterate through
# room_links from that index onwards - just make sure not to re-define the
# lists as empty before resuming.
for link in room_links[2999:]:
    # ... repeat the same per-room extraction body as in the loop above ...
    pass
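# Sketch of the retry wrapper the comment above alludes to. The retry count
# and backoff are arbitrary choices, not values from the original run.
import time

def get_with_retries(session, url, retries=3, backoff=5):
    """Fetch url, sleeping and retrying on connection errors."""
    for attempt in range(retries):
        try:
            return session.get(url)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))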
# Read the saved CSV back in. Note the filename here ('rooms_sing_4.csv')
# differs from the "name2.csv" written above.
import pandas as pd

# Variant 1: supply the column names explicitly and drop the stray header row
df = pd.read_csv('rooms_sing_4.csv', encoding='utf-8',
                 names=['title', 'desc', 'lat', 'long', 'price', 'room_details',
                        'location_district', 'location_area'])
df = df.reset_index(drop=True)
df.drop(df.head(1).index, inplace=True)
df

# Variant 2: let pandas read the header from the file itself
df = pd.read_csv('rooms_sing_4.csv', encoding='utf-8')
df = df.reset_index(drop=True)
df
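# Sketch: typical post-load cleanup for this frame. Column names follow the
# read above; the price regex assumes "S$ 1,200"-style strings, which is a
# guess at the page's formatting, not something verified against the data.
df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
df['long'] = pd.to_numeric(df['long'], errors='coerce')
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce')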
# Quick sanity checks on the collected lists (notebook inspection cells)
room_links
room_details
desc