Thursday, November 22, 2018

Scrape Indeed job listing data and save it to an Excel file
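
The script below depends on three third-party packages: requests, BeautifulSoup (bs4), and XlsxWriter. Assuming a standard pip setup, they can be installed with:

pip install requests beautifulsoup4 xlsxwriter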

import requests
import xlsxwriter
from bs4 import BeautifulSoup



def save_job_details(category, job_list, row, col):
    # The category name comes from Indeed's navigation list entry.
    job_company = category.find('span', {'class': 'icl-NavigationList-primaryText '}).text
    for job in job_list:
        job_name = job.find('h2').text.strip()
        company_name = job.find('span', {'class': 'company'}).text.strip()
        location = job.find('span', {'class': 'location'}).text.strip()
        date = job.find('span', {'class': 'date'}).text.strip()

        # Follow the job card's link to the detail page for the full description.
        detail_page_link = 'https://www.indeed.co.uk' + job.find('a', {'class': 'turnstileLink'})['href']
        detail_page_response = requests.get(detail_page_link, timeout=9999)
        detail_page_content = BeautifulSoup(detail_page_response.content, "html.parser")
        description = detail_page_content.find('div', attrs={'class': 'jobsearch-JobComponent-description icl-u-xs-mt--md'}).text

        worksheet.write(row, col, job_company)
        worksheet.write(row, col + 1, job_name)
        worksheet.write(row, col + 2, company_name)
        worksheet.write(row, col + 3, location)
        worksheet.write(row, col + 4, description)
        worksheet.write(row, col + 5, date)
        worksheet.write(row, col + 6, detail_page_link)
        row += 1
    # Return the next empty row so the caller can keep appending below it.
    return row
# Fetch the Indeed homepage, whose navigation menu lists the job categories.
page_link = 'https://www.indeed.co.uk/?sq=1'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

# Create the workbook and worksheet that will hold the scraped jobs.
workbook = xlsxwriter.Workbook('jobs_list.xlsx')
worksheet = workbook.add_worksheet()

row = 0
col = 0

# Write the header row.
worksheet.write(row, col, 'Job Category')
worksheet.write(row, col + 1, 'Job Name')
worksheet.write(row, col + 2, 'Company Name')
worksheet.write(row, col + 3, 'Location')
worksheet.write(row, col + 4, 'Description')
worksheet.write(row, col + 5, 'Date')
worksheet.write(row, col + 6, 'Job Detail Link')
row += 1

categories = page_content.findAll('li', attrs={'class': 'icl-NavigationList-item'})

for category in categories:
    # Fetch the first results page for this category.
    page_link = 'https://www.indeed.co.uk' + category.find('a')['href']
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
    row = save_job_details(category, job_list, row, col)

    # If the last link in the pagination bar is a next-page arrow
    # (its <span> carries the class 'pn'), fetch that one extra page too.
    check_next = page_content.find('div', {'class': 'pagination'})
    next_page = check_next.findAll('a')[-1]
    check_span = next_page.find('span')
    if 'pn' in check_span['class']:
        page_link = 'https://www.indeed.co.uk' + next_page['href']
        page_response = requests.get(page_link, timeout=9999)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
        row = save_job_details(category, job_list, row, col)

workbook.close()
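
Running the script produces jobs_list.xlsx in the working directory. One caveat: every find() call above assumes the target element exists; BeautifulSoup returns None when an element is missing, so a single renamed class on Indeed's side raises an AttributeError. A minimal defensive sketch (the helper name safe_text is my own, not part of the original script):

def safe_text(parent, name, class_name=None):
    # Return the stripped text of a child tag, or '' if the tag is missing.
    attrs = {'class': class_name} if class_name else {}
    tag = parent.find(name, attrs)
    return tag.text.strip() if tag else ''

# Usage: company_name = safe_text(job, 'span', 'company')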
