import pdb
import json
import requests
import datetime
from bs4 import BeautifulSoup
# Scrape paginated job listings from jobs.github.com and dump them to
# github_response.json as a list of {title, company, job_type, location,
# timezone} dicts. Stops early when the site reports "Nothing found".
BASE_URL = 'https://jobs.github.com/positions?page='
REQUEST_TIMEOUT = 30  # seconds; the original 9999 effectively disabled the timeout

response = []
for page_index in range(100):
    print(page_index)
    page_response = requests.get(BASE_URL + str(page_index),
                                 timeout=REQUEST_TIMEOUT)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    # Stop when the site reports no more results. Guard both lookups:
    # a missing div/h1 previously raised AttributeError.
    page_div = page_content.find('div', attrs={'id': 'page'})
    heading = page_div.find('h1') if page_div is not None else None
    if heading is not None and heading.text.strip() == 'Nothing found':
        break

    tables = page_content.findAll('table', attrs={'class': 'positionlist'})
    for row in tables[0].findAll('tr'):
        title_td = row.find('td', {'class': 'title'})
        # Header/spacer rows carry no title cell; skip them rather than
        # aborting the rest of the page (the original bare `except: break`
        # dropped every remaining row on the page).
        if title_td is None or title_td.find('h4') is None:
            continue
        job_title = title_td.find('h4').text
        company = title_td.find('a', {'class': 'company'}).text

        # Job-type badge: the <strong> element is tagged fulltime /
        # parttime / contract; its text is what we want in every case.
        # Guard against postings without a badge — the original if/elif
        # chain left `fulltime` unset (NameError) or stale from the
        # previous row when no class matched.
        badge = title_td.find('strong')
        job_type = badge.text if badge is not None else ''

        meta_td = row.find('td', {'class': 'meta'})
        location = meta_td.find('span', {'class': 'location'}).text
        timezone = meta_td.find('span', {'class': 'when'}).text

        response.append({'title': job_title,
                         'company': company,
                         'job_type': job_type,
                         'location': location,
                         'timezone': timezone,
                         })

with open('github_response.json', 'w') as outfile:
    json.dump(response, outfile)
# NOTE: the lines below were blog-page residue ("No comments: / Post a
# Comment") captured when this script was copied from a web page; they are
# commented out so the file is valid Python.