"""Thursday, November 22, 2018

Scrape all job data from GitHub Jobs (https://jobs.github.com) and export it
in JSON format.
"""

import pdb
import json
import requests
import datetime
from bs4 import BeautifulSoup

# Collected job postings, one dict per listing.
# (The original script performed an extra, unused fetch of page 0 here;
# the loop below re-fetched it anyway, so that request has been dropped.)
response = []

# GitHub Jobs paginates results; walk pages until the site reports
# "Nothing found", capped at a sane upper bound of 100 pages.
for page_num in range(100):
    print(page_num)  # progress indicator
    page_link = 'https://jobs.github.com/positions?page=' + str(page_num)
    # timeout is in seconds; the original value of 9999 effectively disabled it
    page_response = requests.get(page_link, timeout=10)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # An exhausted result set renders an <h1>Nothing found</h1> inside #page.
    page_div = page_content.find('div', attrs={'id': 'page'})
    heading = page_div.find('h1') if page_div is not None else None
    if heading is not None and heading.text.strip() == 'Nothing found':
        break  # ran past the last page of results

    tables = page_content.find_all('table', attrs={'class': 'positionlist'})
    if not tables:
        break  # layout changed or empty response; stop rather than crash

    for row in tables[0].find_all('tr'):
        title_td = row.find('td', {'class': 'title'})
        if title_td is None:
            continue  # not a listing row (spacer/header)

        # Skip rows without a job title instead of the original bare
        # `except: break`, which silently discarded every remaining row
        # on the page after the first non-listing row.
        title_tag = title_td.find('h4')
        if title_tag is None:
            continue
        job_title = title_tag.text
        company = title_td.find('a', {'class': 'company'}).text

        # Job type is rendered as <strong class="fulltime|parttime|contract">;
        # its text is what the original three-way branch ultimately extracted.
        # Reading it directly also avoids the NameError/stale-value bug when
        # no <strong> (or an unknown class) is present.
        strong = title_td.find('strong')
        job_type = strong.text if strong is not None else ''

        meta_td = row.find('td', {'class': 'meta'})
        location = meta_td.find('span', {'class': 'location'}).text
        timezone = meta_td.find('span', {'class': 'when'}).text

        response.append({
            'title': job_title,
            'company': company,
            'job_type': job_type,
            'location': location,
            'timezone': timezone,
        })

# Persist everything scraped as a single JSON array.
with open('github_response.json', 'w') as outfile:
    json.dump(response, outfile)

# Blog footer preserved from the original post:
# No comments:
# Post a Comment