Thursday, November 22, 2018

Scrape data from the Canadian Academy of Periodontology site (http://www.cap-acp.ca) and export it to an Excel file.

from string import ascii_lowercase

import requests

import xlsxwriter
from bs4 import BeautifulSoup

row = 0
col = 0

workbook = xlsxwriter.Workbook('users_data.xlsx')
worksheet = workbook.add_worksheet()

worksheet.write(row, col, 'Name')
worksheet.write(row, col + 1, 'Address')
worksheet.write(row, col + 2, 'Country')
worksheet.write(row, col + 3, 'PinCode')
worksheet.write(row, col + 4, 'Phone Number')
worksheet.write(row, col + 5, 'Fax')
worksheet.write(row, col + 6, 'Email ID')
worksheet.write(row, col + 7, 'Web Site')
row += 1

for ascii_char in ascii_lowercase:
    try:
        page_link = 'http://www.cap-acp.ca/en/search/index.php?page=4&search=' + ascii_char
        page_response = requests.get(page_link, timeout=9999)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        data = page_content.findAll('div', attrs={'class': 'main'})

        for div in data:
            links = div.findAll('a')
            for a in links:
                if a['href'] != 'index.php':
                    child_page_response = requests.get(
                        "http://www.cap-acp.ca" + a['href'], timeout=5555)
                    child_page_content = BeautifulSoup(
                        child_page_response.content, "html.parser")

                    new_data = child_page_content.findAll(
                        'div', attrs={'class': 'panel'})
                    new_data = new_data[0].text
                    user = new_data.split('\n')

                    # The panel text exposes each field at a fixed line position.
                    name = user[2]
                    address = user[3]
                    country = user[4]
                    pin_code = user[5]
                    phone_number = user[6]
                    fax = ''
                    if 'Fax' in user[7]:
                        fax = user[7]
                    email = ''
                    if 'E-mail' in user[8]:
                        email = user[8]
                    website = ''
                    if 'Web site' in user[9]:
                        website = user[9]

                    worksheet.write(row, col, name)
                    worksheet.write(row, col + 1, address)
                    worksheet.write(row, col + 2, country)
                    worksheet.write(row, col + 3, pin_code)
                    worksheet.write(row, col + 4, phone_number)
                    worksheet.write(row, col + 5, fax)
                    worksheet.write(row, col + 6, email)
                    worksheet.write(row, col + 7, website)

                    row += 1
    except Exception:
        # Skip letters whose result pages fail to load or parse.
        pass

workbook.close()
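
To sanity-check the export, the workbook can be read straight back; this is a minimal sketch of my own, assuming openpyxl is installed alongside xlsxwriter:

from openpyxl import load_workbook

# Reopen the file written above and print each row as a tuple of cell values.
wb = load_workbook('users_data.xlsx')
ws = wb.active
for record in ws.iter_rows(values_only=True):
    print(record)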

Scrape all job data from GitHub Jobs (https://jobs.github.com) and export it in JSON format.

import json

import requests
from bs4 import BeautifulSoup

page_link = 'https://jobs.github.com/positions?page=0'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

response = []

for i in range(100):
    print(i)
    page_link = 'https://jobs.github.com/positions?page=' + str(i)
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    data = page_content.find('div', attrs={'id': 'page'})

    # Stop once the listing runs out of pages.
    if data.find('h1').text.strip() == 'Nothing found':
        break
    data = page_content.findAll('table', attrs={'class': 'positionlist'})

    table_tr = data[0].findAll('tr')

    for row in table_tr:
        response_obj = {}
        row_td = row.find('td', {'class': 'title'})
        try:
            job_title = row_td.find('h4').text
        except AttributeError:
            # Rows without a title cell are spacers; stop reading this table.
            break
        company = row_td.find('a', {'class': 'company'}).text

        if 'fulltime' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'fulltime'}).text
        elif 'parttime' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'parttime'}).text
        elif 'contract' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'contract'}).text
        else:
            # Unknown job-type marker; avoid carrying over the previous row's value.
            job_type = ''

        row_td = row.find('td', {'class': 'meta'})
        location = row_td.find('span', {'class': 'location'}).text
        timezone = row_td.find('span', {'class': 'when'}).text
        response_obj.update({'title': job_title,
                             'company': company,
                             'job_type': job_type,
                             'location': location,
                             'timezone': timezone,
                             })
        response.append(response_obj)

with open('github_response.json', 'w') as outfile:
    json.dump(response, outfile)
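
GitHub Jobs also exposed the same listings through a JSON endpoint, which avoids HTML parsing entirely. A minimal sketch, assuming the positions.json endpoint is still available (the output filename here is my own choice):

import json
import requests

# Walk the JSON API page by page until it returns an empty list.
jobs = []
page = 0
while True:
    resp = requests.get('https://jobs.github.com/positions.json',
                        params={'page': page}, timeout=30)
    batch = resp.json()
    if not batch:
        break
    jobs.extend(batch)
    page += 1

with open('github_response_api.json', 'w') as outfile:
    json.dump(jobs, outfile)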

Python code: convert an image to text

'''
Image-to-text converter using Tesseract OCR.
Dependencies:
    sudo apt install tesseract-ocr
    pip install Pillow
    pip install pytesseract
'''

from PIL import Image
from pytesseract import image_to_string

print(image_to_string(Image.open('ocr_scrap.jpg'), lang='eng'))
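
If the tesseract binary is installed somewhere that is not on PATH, pytesseract can be pointed at it explicitly, and the result can be written to a file. A minimal sketch; the binary path and output filename below are assumptions, adjust them to your setup:

from PIL import Image
import pytesseract

# Assumed install location of the tesseract binary; change as needed.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

text = pytesseract.image_to_string(Image.open('ocr_scrap.jpg'), lang='eng')

# Save the extracted text alongside the image (hypothetical output name).
with open('ocr_scrap.txt', 'w') as out:
    out.write(text)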

Scrape Indeed job listings and save them in an Excel file.

import requests
import xlsxwriter
from bs4 import BeautifulSoup



def save_job_details(category, job_list, row, col):
    # The category label comes from the navigation-list entry that led here.
    job_company = category.find('span', {'class': 'icl-NavigationList-primaryText '}).text
    for job in job_list:
        job_name = job.find('h2').text.strip()
        company_name = job.find('span', {'class': 'company'}).text.strip()
        location = job.find('span', {'class': 'location'}).text.strip()
        date = job.find('span', {'class': 'date'}).text.strip()

        detail_page_link = 'https://www.indeed.co.uk' + job.find('a', {'class': 'turnstileLink'})['href']
        detail_page_response = requests.get(detail_page_link, timeout=9999)
        detail_page_content = BeautifulSoup(detail_page_response.content, "html.parser")
        description = detail_page_content.find('div', attrs={'class': 'jobsearch-JobComponent-description icl-u-xs-mt--md'}).text

        worksheet.write(row, col, job_company)
        worksheet.write(row, col + 1, job_name)
        worksheet.write(row, col + 2, company_name)
        worksheet.write(row, col + 3, location)
        worksheet.write(row, col + 4, description)
        worksheet.write(row, col + 5, date)
        worksheet.write(row, col + 6, detail_page_link)
        row += 1
    # Return the updated row index so the caller keeps appending instead of overwriting.
    return row

page_link = 'https://www.indeed.co.uk/?sq=1'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

workbook = xlsxwriter.Workbook('jobs_list.xlsx')
worksheet = workbook.add_worksheet()

row = 0
col = 0

worksheet.write(row, col, 'Job Category')
worksheet.write(row, col + 1, 'Job Name')
worksheet.write(row, col + 2, 'Company Name')
worksheet.write(row, col + 3, 'Location')
worksheet.write(row, col + 4, 'Description')
worksheet.write(row, col + 5, 'Date')
worksheet.write(row, col + 6, 'Job Detail Link')
row += 1

categories = page_content.findAll('li', attrs={'class': 'icl-NavigationList-item'})

for category in categories:
    page_link = 'https://www.indeed.co.uk' + category.find('a')['href']
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
    row = save_job_details(category, job_list, row, col)

    # Follow the next pagination link once, if one is present.
    check_next = page_content.find('div', {'class': 'pagination'})
    next_page = check_next.findAll('a')[-1]
    check_span = next_page.find('span')
    if 'pn' in check_span['class']:
        page_link = 'https://www.indeed.co.uk' + next_page['href']
        page_response = requests.get(page_link, timeout=9999)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
        row = save_job_details(category, job_list, row, col)

workbook.close()
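
Indeed tends to block rapid anonymous requests, so in practice it helps to reuse one session with a browser-like User-Agent and a small pause between fetches. A minimal sketch of my own; the header string and delay are assumptions, not values from the script above:

import time
import requests

# Hypothetical polite-fetch helper: shared session, explicit UA, fixed delay.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; job-scraper/0.1)'})

def polite_get(url, delay=2, timeout=30):
    """Fetch a URL through the shared session, then pause before the next request."""
    response = session.get(url, timeout=timeout)
    time.sleep(delay)
    return response

page_response = polite_get('https://www.indeed.co.uk/?sq=1')

The rest of the script would then call polite_get wherever it currently calls requests.get.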

Scrape Instagram users' profile data from the public profiles directory.

import json
import time

import requests
import xlsxwriter
from bs4 import BeautifulSoup

row1 = 0
row = 0
col = 0

workbook = xlsxwriter.Workbook('instagram_users_data.xlsx')
worksheet = workbook.add_worksheet()

worksheet.write(row, col, 'Name')
worksheet.write(row, col + 1, 'Handle')
worksheet.write(row, col + 2, 'Email')
worksheet.write(row, col + 3, 'Category')
worksheet.write(row, col + 4, 'No Of Followers')
worksheet.write(row, col + 5, 'Average Likes')
worksheet.write(row, col + 6, 'Total Posts')

workbook2 = xlsxwriter.Workbook('instagram_all_users_data.xlsx')
worksheet2 = workbook2.add_worksheet()

worksheet2.write(row1, col, 'Name')
worksheet2.write(row1, col + 1, 'Handle')
worksheet2.write(row1, col + 2, 'Email')
worksheet2.write(row1, col + 3, 'Phone Number')
worksheet2.write(row1, col + 4, 'Category')
worksheet2.write(row1, col + 5, 'No Of Followers')
worksheet2.write(row1, col + 6, 'No Of Following')
worksheet2.write(row1, col + 7, 'Average Likes')
worksheet2.write(row1, col + 8, 'Total Posts')
worksheet2.write(row1, col + 9, 'Profile Url')

row += 1
row1 += 1


for i in range(100):
    for j in range(10):
        try:
            print(j)
            list_link = 'https://www.instagram.com/directory/profiles/' + str(i) + '-' + str(j) + '/'
            list_response = requests.get(list_link, timeout=9999)
            list_content = BeautifulSoup(list_response.content, "html.parser")

            # The directory page embeds its profile list as JSON inside the
            # window._sharedData script tag.
            script_list = list_content.findAll('script')
            script_list = script_list[3].text
            script_list = script_list.replace(';', '')
            script_list = script_list.replace('window._sharedData = ', '')
            all_data = json.loads(script_list)
            all_data = all_data['entry_data']['ProfilesDirectoryPage'][0]['profile_data']['profile_list']
            all_data = json.loads(all_data)

            for data in all_data:
                try:
                    page_link = 'https://www.instagram.com/' + str(data) + '/'
                    print(page_link)
                    page_response = requests.get(page_link, timeout=9999)
                    page_content = BeautifulSoup(page_response.content, "html.parser")

                    script_tag = page_content.findAll('script')
                    script_tag = script_tag[3].text
                    clean_script = script_tag.replace(';', '')
                    clean_script = clean_script.replace('window._sharedData = ', '')

                    try:
                        json_data = json.loads(clean_script)
                        user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
                        no_of_followers = user['edge_followed_by']['count']
                        is_business_account = user['is_business_account']
                        total_posts = user['edge_owner_to_timeline_media']['count']

                        # Keep small business accounts with a reasonable posting history.
                        if is_business_account and 250 < no_of_followers < 20000 and total_posts > 15:
                            name = user['full_name']
                            handle = user['username']
                            email = user['business_email']
                            phone_number = user['business_phone_number']
                            category = user['business_category_name']
                            no_of_following = user['edge_follow']['count']
                            # Like/comment counts of the most recent post.
                            average_likes_per_posts = user['edge_owner_to_timeline_media']['edges'][0]['node']['edge_liked_by']['count']
                            average_comments_per_posts = user['edge_owner_to_timeline_media']['edges'][0]['node']['edge_media_to_comment']['count']

                            worksheet.write(row, col, name)
                            worksheet.write(row, col + 1, handle)
                            worksheet.write(row, col + 2, email)
                            worksheet.write(row, col + 3, category)
                            worksheet.write(row, col + 4, no_of_followers)
                            worksheet.write(row, col + 5, average_likes_per_posts)
                            worksheet.write(row, col + 6, total_posts)
                            row += 1

                            worksheet2.write(row1, col, name)
                            worksheet2.write(row1, col + 1, handle)
                            worksheet2.write(row1, col + 2, email)
                            worksheet2.write(row1, col + 3, phone_number)
                            worksheet2.write(row1, col + 4, category)
                            worksheet2.write(row1, col + 5, no_of_followers)
                            worksheet2.write(row1, col + 6, no_of_following)
                            worksheet2.write(row1, col + 7, average_likes_per_posts)
                            worksheet2.write(row1, col + 8, total_posts)
                            worksheet2.write(row1, col + 9, page_link)
                            row1 += 1
                    except Exception:
                        pass
                except Exception:
                    # Back off before trying the next profile (likely rate limiting).
                    time.sleep(200)
        except Exception:
            # Back off longer if the directory page itself fails.
            time.sleep(400)

workbook.close()
workbook2.close()
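
Both lookups above rely on window._sharedData being the fourth script tag on the page, which breaks as soon as Instagram reorders its markup. A more robust helper (my own sketch, not part of the original script) searches the script tags by content instead:

import json
from bs4 import BeautifulSoup

def extract_shared_data(html):
    """Locate the window._sharedData blob by content instead of a fixed script index."""
    soup = BeautifulSoup(html, "html.parser")
    for script in soup.find_all('script'):
        text = script.text or ''
        if text.strip().startswith('window._sharedData'):
            # Strip the assignment and trailing semicolon, then parse the JSON payload.
            payload = text.split('=', 1)[1].strip().rstrip(';')
            return json.loads(payload)
    return None

It can then replace the index-based extraction in both loops, e.g. json_data = extract_shared_data(page_response.content).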

How to scrape all job listings from https://stackoverflow.com/jobs

'''
Scrape all Stack Overflow jobs and save the response to a file in JSON format.
'''

import json
import requests
import datetime
from bs4 import BeautifulSoup

page_link = 'https://stackoverflow.com/jobs?sort=p'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

# Derive the number of result pages from the pagination links.
data = page_content.findAll('a', attrs={'class': 'job-link'})[-2]
loop_count = data.find('span').text
response = []

# Override: crawl only the first page for now; drop this line to walk every page.
loop_count = 1
for i in range(int(loop_count)):
    page_link = 'https://stackoverflow.com/jobs?sort=p&pg=' + str(i)
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    data = page_content.findAll('a', attrs={'class': 's-link__visited'})
    for d in data:
        job_page_link = 'https://stackoverflow.com' + d['href']
        response_obj = {}

        job_page_response = requests.get(job_page_link, timeout=9999)
        job_page_content = BeautifulSoup(
            job_page_response.content, "html.parser")

        job_header = job_page_content.findAll('header', {'class': 'job-details--header'})
        job_header = job_header[0]

        # ----- Job Details ---- #
        company_logo = job_header.find('div', {'class': 's-avatar'})
        company_logo = company_logo.find('img')
        company_logo = company_logo['src']
        job_title = job_header.find('h1', {'class': 'fs-headline1'})
        job_title = job_title.find('a').text

        company_info = job_page_content.find(
            'div', attrs={'class': 'fc-black-700 fs-body3'}).text
        company_info = company_info.split('\n')

        salary = ''
        location_remote = ''
        location_visa = ''
        location_relocation = ''
        try:
            company_obj_list = job_header.find('div', {'class': 'mt12'})
            company_obj_list = company_obj_list.findAll('span')

            for company_obj in company_obj_list:
                if '-salary' in company_obj['class']:
                    salary = company_obj.text.strip()
                if '-remote' in company_obj['class']:
                    location_remote = company_obj.text.strip()
                if '-visa' in company_obj['class']:
                    location_visa = company_obj.text.strip()
                if '-relocation' in company_obj['class']:
                    location_relocation = company_obj.text.strip()
        except Exception:
            # Some postings omit the salary/remote/visa/relocation badges.
            pass

        date_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        response_obj.update({'timestamp': date_time,
                             'job_url': job_page_link,
                             'company_logo': company_logo,
                             'job_title': job_title,
                             'company_name': company_info[1],
                             'company_location': company_info[3],
                             'salary': salary,
                             'remote': location_remote,
                             'visa': location_visa,
                             'relocation': location_relocation})

        job_data = job_page_content.find('div', attrs={'id': 'overview-items'})
        sections = job_data.findAll('section', {'class': 'mb32'})

        # ---- Section one: About this job ---- #
        section_1 = sections[0]
        about_job = section_1.findAll('div', {'class': 'mb8'})

        for ajob in about_job:
            headings = ajob.findAll('span')
            label = headings[0].text.strip().replace(':', '').replace(' ', '_').lower()
            values = headings[1].text
            response_obj.update({label: values})

        # ---- Section two: Technologies ---- #
        section_2 = sections[1]
        technologies = section_2.findAll('a', {'class': 'job-link'})
        tech_stack = ''
        for tech in technologies:
            tech_stack = tech_stack + tech.text + ', '
        response_obj.update({'technologies': tech_stack})

        # ---- Section three: Job description ---- #
        section_3 = sections[2]
        job_description = section_3.find('div').text
        response_obj.update({'job_description': job_description})

        response.append(response_obj)

with open('response.json', 'w') as outfile:
    json.dump(response, outfile)
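
If a spreadsheet is needed instead of JSON, the saved response can be flattened into CSV with the standard library. A minimal sketch of my own; the output filename is an assumption:

import csv
import json

# Load the scraped jobs and write them out with the union of all keys as columns.
with open('response.json') as infile:
    jobs = json.load(infile)

fieldnames = sorted({key for job in jobs for key in job})
with open('response.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(jobs)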