import pdb
import json
import requests
import datetime
from bs4 import BeautifulSoup
# Scrape paginated job listings from the GitHub Jobs site and dump them to
# github_response.json as a list of {title, company, job_type, location,
# timezone} records.
#
# NOTE(review): jobs.github.com was retired in 2021; the markup-specific
# selectors below match the old page layout and are kept for reference.

BASE_URL = 'https://jobs.github.com/positions?page='
MAX_PAGES = 100  # hard upper bound; the loop stops early on "Nothing found"

response = []  # accumulated job records across all pages
for page_num in range(MAX_PAGES):
    print(page_num)
    # Original code made one extra request before the loop and discarded the
    # result; fetch each page exactly once. A 9999-second timeout is
    # effectively no timeout — use a sane bound instead.
    page_response = requests.get(BASE_URL + str(page_num), timeout=10)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # Past the last page the site renders <h1>Nothing found</h1> inside #page.
    data = page_content.find('div', attrs={'id': 'page'})
    if data.find('h1').text.strip() == 'Nothing found':
        break

    tables = page_content.find_all('table', attrs={'class': 'positionlist'})
    for row in tables[0].find_all('tr'):
        row_td = row.find('td', {'class': 'title'})
        # Spacer/header rows carry no <h4>; skip just that row rather than
        # aborting the rest of the page (original bare `except: break` did).
        try:
            job_title = row_td.find('h4').text
        except AttributeError:
            continue
        company = row_td.find('a', {'class': 'company'}).text

        # The job-type badge is a <strong> classed fulltime/parttime/contract;
        # its text is what we store either way, so read it directly. Original
        # bug: with no badge present, `fulltime` silently kept the previous
        # row's value (or `find(...)['class']` raised TypeError on None).
        strong_tag = row_td.find('strong')
        fulltime = strong_tag.text if strong_tag is not None else None

        row_td = row.find('td', {'class': 'meta'})
        location = row_td.find('span', {'class': 'location'}).text
        timezone = row_td.find('span', {'class': 'when'}).text

        response.append({'title': job_title,
                         'company': company,
                         'job_type': fulltime,
                         'location': location,
                         'timezone': timezone,
                         })

# Explicit encoding so the output is UTF-8 regardless of platform default.
with open('github_response.json', 'w', encoding='utf-8') as outfile:
    json.dump(response, outfile)
# (Blog-scrape residue removed: the lines "No comments:" / "Post a Comment"
# were footer text from the web page this script was copied from, not code.)