In [ ]:
import re

import requests
from bs4 import BeautifulSoup

# first, inspect what we're dealing with on the landing page
# note: a `with requests.Session()` block would close the session at the end of
# this cell, but the cells below keep reusing it, so create it directly instead
session = requests.Session()

response = session.get("https://rentinsingapore.com.sg/rooms-for-rent")
soup = BeautifulSoup(response.content, "html.parser")
# extract the last page (left commented out here; see the next cell)
# last_page = int(re.search(r"page=(\d+)", soup.select_one("li.pager-last").a["href"]).group(1))
print(soup.prettify())
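In [ ]:
# A sketch building on the commented-out last_page line above: extract the page
# count automatically instead of hard-coding it. This assumes the pager exposes
# a li.pager-last element whose link uses the /page-N path pattern seen in the
# next cell; adjust the regex if the site uses a ?page=N query parameter instead.
pager = soup.select_one("li.pager-last")
if pager is not None and pager.a is not None:
    last_page = int(re.search(r"page-(\d+)", pager.a["href"]).group(1))
    print(last_page)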
In [ ]:
#get all the individual room links

room_links = []

base_url = "https://rentinsingapore.com.sg/rooms-for-rent"

# request pages 1 through n; there were 546 result pages at the time of scraping
n = 546
for i in range(1, n + 1):
    print(i)  # progress indicator
    if i == 1:
        # the first results page has no /page-N suffix
        response = session.get(base_url)
    else:
        response = session.get(base_url + "/page-%d" % i)
  
    soup = BeautifulSoup(response.content, "html.parser")

    rooms = soup.find_all('div', class_='room__wide listing-container')
    for room in rooms:
        # collect every href inside the listing container
        for link in room.find_all('a'):
            room_links.append(link.get('href'))
        
print(room_links)
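In [ ]:
# Each listing container can hold several <a> tags pointing at the same detail
# page, so it is worth de-duplicating the collected hrefs (order-preserving)
# and dropping any None values before scraping the detail pages.
room_links = [link for link in dict.fromkeys(room_links) if link]
print(len(room_links))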
In [ ]:
#go to each of the individual room links and extract elements you're interested in

import pandas as pd

base_url = "https://rentinsingapore.com.sg/"

title = []
location_street = []
location_district = []
location_area = []
price = []
room_details = []
desc = []
lat = []
long = []

for link in room_links:
    # crude progress indicator: number of rooms processed so far
    print(len(desc))
    response = session.get(base_url + link)
    soup = BeautifulSoup(response.content, "html.parser")

    # collect the h1 title
    element = soup.find('div', class_='pull-left')
    try:
        title.append(element.find('h1').string)
    except Exception:
        title.append("N/A")

    # collect street, district and area from the location element's children
    element = soup.find('div', class_='room-location')
    try:
        location_street.append(element.contents[0])
    except Exception:
        location_street.append("N/A")
    try:
        location_district.append(element.contents[2])
    except Exception:
        location_district.append("N/A")
    try:
        location_area.append(element.contents[4])
    except Exception:
        location_area.append('N/A')

    # collect price - only the first child is needed
    element = soup.find('div', class_='room-price')
    try:
        price.append(element.contents[0])
    except Exception:
        price.append('N/A')

    # collect the details in a list - separate them later
    element = soup.find('div', class_="room-details container-show-details")
    try:
        details = [li.text for li in element.find_all('li')]
        room_details.append(details)
    except Exception:
        room_details.append('N/A')

    # collect the description text
    try:
        descript = soup.find("p", {"class": "room-description container-show-description"}).get_text(strip=True)
        desc.append(descript)
    except Exception:
        desc.append('N/A')

    # collect latitude and longitude from the hidden map inputs
    try:
        lat.append(soup.find("input", {"class": "js-gmap-lat"})['value'])
    except Exception:
        lat.append('N/A')

    try:
        long.append(soup.find("input", {"class": "js-gmap-lng"})['value'])
    except Exception:
        long.append('N/A')

        
    
rooms = pd.DataFrame(
    {'title': title,
     'location_street': location_street,
     'location_area': location_area,
     'location_district': location_district,
     'price': price,
     'room_details': room_details,
     'desc': desc,
     'lat': lat,
     'long': long
    })

rooms.to_csv("name2.csv", encoding='utf-8')
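In [ ]:
# An aside on politeness: the loop above fires requests back-to-back. A small
# delay between detail-page requests keeps the load on the site reasonable;
# the one-second value below is an arbitrary choice, not a site requirement.
import time

for link in room_links:
    response = session.get(base_url + link)
    # ... same per-room extraction as in the cell above ...
    time.sleep(1)  # throttle: roughly one request per second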
In [ ]:
# Ideally, you would wrap the loop body above in a try/except in case the internet
# connection drops mid-scrape. If you forget (like me), you can still recover:
# check the length of the lists at the moment of the crash and resume iterating
# through room_links from that index - just make sure not to re-define the lists
# as empty first.

for link in room_links[2999:]:
    pass  # re-run the same per-room extraction body as in the cell above
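In [ ]:
# A minimal sketch of the resilient version described above: retry transient
# network errors and checkpoint the partial results periodically, so a crash
# loses at most one batch. Assumes the same session, base_url, room_links and
# collection lists as above; the retry counts, timeout and checkpoint interval
# are arbitrary choices.
import time

def get_with_retries(url, retries=3, wait=5):
    # retry a flaky GET a few times before giving up
    for attempt in range(retries):
        try:
            return session.get(url, timeout=30)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(wait)

for n_done, link in enumerate(room_links, start=1):
    response = get_with_retries(base_url + link)
    # ... same per-room extraction as in the cell above ...
    if n_done % 100 == 0:
        # checkpoint: overwrite the partial CSV every 100 rooms
        pd.DataFrame({'title': title, 'desc': desc}).to_csv("checkpoint.csv", encoding='utf-8')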
In [ ]:
import pandas as pd
# variant 1: read the CSV as headerless with explicit column names, then drop
# the original header row, which gets read in as a data row
df = pd.read_csv('rooms_sing_4.csv', encoding='utf-8', names=['title', 'desc', 'lat', 'long', 'price', 'room_details', 'location_district', 'location_area'])
df = df.reset_index(drop=True)
df.drop(df.head(1).index, inplace=True)
df
In [ ]:
import pandas as pd
# variant 2: read the CSV with its own header row instead
df = pd.read_csv('rooms_sing_4.csv', encoding='utf-8')
df = df.reset_index(drop=True)
df
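In [ ]:
# "Separate them later": a list-valued column like room_details round-trips
# through to_csv as its string representation. A sketch of recovering the
# actual lists, assuming the strings survived the round-trip intact:
import ast

df['room_details'] = df['room_details'].apply(
    lambda s: ast.literal_eval(s) if isinstance(s, str) and s.startswith('[') else s)
df['room_details'].head()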
In [ ]:
room_links
In [ ]:
room_details
In [ ]:
desc
In [ ]:
element2