Thursday, November 22, 2018

Scrape data from the Canadian Academy of Periodontology site (http://www.cap-acp.ca) and export it to an Excel file.

from string import ascii_lowercase

import requests

import xlsxwriter
from bs4 import BeautifulSoup

row = 0
col = 0

workbook = xlsxwriter.Workbook('users_data.xlsx')
worksheet = workbook.add_worksheet()

worksheet.write(row, col, 'Name')
worksheet.write(row, col + 1, 'Address')
worksheet.write(row, col + 2, 'Country')
worksheet.write(row, col + 3, 'PinCode')
worksheet.write(row, col + 4, 'Phone Number')
worksheet.write(row, col + 5, 'Fax')
worksheet.write(row, col + 6, 'Email ID')
worksheet.write(row, col + 7, 'Web Site')
row += 1

for ascii_char in ascii_lowercase:
    try:
        page_link = 'http://www.cap-acp.ca/en/search/index.php?page=4&search=' + ascii_char
        page_response = requests.get(page_link, timeout=9999)
        page_content = BeautifulSoup(page_response.content, "html.parser")

        data = page_content.findAll('div', attrs={'class': 'main'})

        for div in data:
            links = div.findAll('a')
            for a in links:
                if a['href'] != 'index.php':
                    child_page_response = requests.get(
                        "http://www.cap-acp.ca" + a['href'], timeout=5555)
                    child_page_content = BeautifulSoup(
                        child_page_response.content, "html.parser")

                    new_data = child_page_content.findAll(
                        'div', attrs={'class': 'panel'})
                    new_data = new_data[0].text
                    user = new_data.split('\n')

                    # The panel text exposes each field at a fixed line position.
                    name = user[2]
                    address = user[3]
                    country = user[4]
                    pin_code = user[5]
                    phone_number = user[6]
                    fax = ''
                    if 'Fax' in user[7]:
                        fax = user[7]
                    email = ''
                    if 'E-mail' in user[8]:
                        email = user[8]
                    website = ''
                    if 'Web site' in user[9]:
                        website = user[9]

                    worksheet.write(row, col, name)
                    worksheet.write(row, col + 1, address)
                    worksheet.write(row, col + 2, country)
                    worksheet.write(row, col + 3, pin_code)
                    worksheet.write(row, col + 4, phone_number)
                    worksheet.write(row, col + 5, fax)
                    worksheet.write(row, col + 6, email)
                    worksheet.write(row, col + 7, website)

                    row += 1
    except Exception:
        # Skip letters whose result pages fail to load or parse.
        pass

workbook.close()
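
To sanity-check the export, the workbook can be read straight back; this is a minimal sketch of my own, assuming openpyxl is installed alongside xlsxwriter:

from openpyxl import load_workbook

# Reopen the file written above and print each row as a tuple of cell values.
wb = load_workbook('users_data.xlsx')
ws = wb.active
for record in ws.iter_rows(values_only=True):
    print(record)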

Scrape all job data from GitHub Jobs (https://jobs.github.com) and export it in JSON format.

import json

import requests
from bs4 import BeautifulSoup

page_link = 'https://jobs.github.com/positions?page=0'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

response = []

for i in range(100):
    print(i)
    page_link = 'https://jobs.github.com/positions?page=' + str(i)
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")

    data = page_content.find('div', attrs={'id': 'page'})

    # Stop once the listing runs out of pages.
    if data.find('h1').text.strip() == 'Nothing found':
        break
    data = page_content.findAll('table', attrs={'class': 'positionlist'})

    table_tr = data[0].findAll('tr')

    for row in table_tr:
        response_obj = {}
        row_td = row.find('td', {'class': 'title'})
        try:
            job_title = row_td.find('h4').text
        except AttributeError:
            # Rows without a title cell are spacers; stop reading this table.
            break
        company = row_td.find('a', {'class': 'company'}).text

        if 'fulltime' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'fulltime'}).text
        elif 'parttime' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'parttime'}).text
        elif 'contract' in row_td.find('strong')['class']:
            job_type = row_td.find('strong', {'class': 'contract'}).text
        else:
            # Unknown job-type marker; avoid carrying over the previous row's value.
            job_type = ''

        row_td = row.find('td', {'class': 'meta'})
        location = row_td.find('span', {'class': 'location'}).text
        timezone = row_td.find('span', {'class': 'when'}).text
        response_obj.update({'title': job_title,
                             'company': company,
                             'job_type': job_type,
                             'location': location,
                             'timezone': timezone,
                             })
        response.append(response_obj)

with open('github_response.json', 'w') as outfile:
    json.dump(response, outfile)
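
GitHub Jobs also exposed the same listings through a JSON endpoint, which avoids HTML parsing entirely. A minimal sketch, assuming the positions.json endpoint is still available (the output filename here is my own choice):

import json
import requests

# Walk the JSON API page by page until it returns an empty list.
jobs = []
page = 0
while True:
    resp = requests.get('https://jobs.github.com/positions.json',
                        params={'page': page}, timeout=30)
    batch = resp.json()
    if not batch:
        break
    jobs.extend(batch)
    page += 1

with open('github_response_api.json', 'w') as outfile:
    json.dump(jobs, outfile)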

Python code: convert an image to text

'''
Image-to-text converter using Tesseract OCR.
Dependencies:
    sudo apt install tesseract-ocr
    pip install Pillow
    pip install pytesseract
'''

from PIL import Image
from pytesseract import image_to_string

print(image_to_string(Image.open('ocr_scrap.jpg'), lang='eng'))
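
If the tesseract binary is installed somewhere that is not on PATH, pytesseract can be pointed at it explicitly, and the result can be written to a file. A minimal sketch; the binary path and output filename below are assumptions, adjust them to your setup:

from PIL import Image
import pytesseract

# Assumed install location of the tesseract binary; change as needed.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

text = pytesseract.image_to_string(Image.open('ocr_scrap.jpg'), lang='eng')

# Save the extracted text alongside the image (hypothetical output name).
with open('ocr_scrap.txt', 'w') as out:
    out.write(text)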

Scrape Indeed job listings and save them in an Excel file.

import requests
import xlsxwriter
from bs4 import BeautifulSoup



def save_job_details(category, job_list, row, col):
    # The category label comes from the navigation-list entry that led here.
    job_company = category.find('span', {'class': 'icl-NavigationList-primaryText '}).text
    for job in job_list:
        job_name = job.find('h2').text.strip()
        company_name = job.find('span', {'class': 'company'}).text.strip()
        location = job.find('span', {'class': 'location'}).text.strip()
        date = job.find('span', {'class': 'date'}).text.strip()

        detail_page_link = 'https://www.indeed.co.uk' + job.find('a', {'class': 'turnstileLink'})['href']
        detail_page_response = requests.get(detail_page_link, timeout=9999)
        detail_page_content = BeautifulSoup(detail_page_response.content, "html.parser")
        description = detail_page_content.find('div', attrs={'class': 'jobsearch-JobComponent-description icl-u-xs-mt--md'}).text

        worksheet.write(row, col, job_company)
        worksheet.write(row, col + 1, job_name)
        worksheet.write(row, col + 2, company_name)
        worksheet.write(row, col + 3, location)
        worksheet.write(row, col + 4, description)
        worksheet.write(row, col + 5, date)
        worksheet.write(row, col + 6, detail_page_link)
        row += 1
    # Return the updated row index so the caller keeps appending instead of overwriting.
    return row

page_link = 'https://www.indeed.co.uk/?sq=1'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

workbook = xlsxwriter.Workbook('jobs_list.xlsx')
worksheet = workbook.add_worksheet()

row = 0
col = 0

worksheet.write(row, col, 'Job Category')
worksheet.write(row, col + 1, 'Job Name')
worksheet.write(row, col + 2, 'Company Name')
worksheet.write(row, col + 3, 'Location')
worksheet.write(row, col + 4, 'Description')
worksheet.write(row, col + 5, 'Date')
worksheet.write(row, col + 6, 'Job Detail Link')
row += 1

categories = page_content.findAll('li', attrs={'class': 'icl-NavigationList-item'})

for category in categories:
    page_link = 'https://www.indeed.co.uk' + category.find('a')['href']
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
    row = save_job_details(category, job_list, row, col)

    # Follow the next pagination link once, if one is present.
    check_next = page_content.find('div', {'class': 'pagination'})
    next_page = check_next.findAll('a')[-1]
    check_span = next_page.find('span')
    if 'pn' in check_span['class']:
        page_link = 'https://www.indeed.co.uk' + next_page['href']
        page_response = requests.get(page_link, timeout=9999)
        page_content = BeautifulSoup(page_response.content, "html.parser")
        job_list = page_content.findAll('div', {'class': 'jobsearch-SerpJobCard row result'})
        row = save_job_details(category, job_list, row, col)

workbook.close()
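
Indeed tends to block rapid anonymous requests, so in practice it helps to reuse one session with a browser-like User-Agent and a small pause between fetches. A minimal sketch of my own; the header string and delay are assumptions, not values from the script above:

import time
import requests

# Hypothetical polite-fetch helper: shared session, explicit UA, fixed delay.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; job-scraper/0.1)'})

def polite_get(url, delay=2, timeout=30):
    """Fetch a URL through the shared session, then pause before the next request."""
    response = session.get(url, timeout=timeout)
    time.sleep(delay)
    return response

page_response = polite_get('https://www.indeed.co.uk/?sq=1')

The rest of the script would then call polite_get wherever it currently calls requests.get.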

Scrape Instagram users' profile data from the public profiles directory.

import json
import time

import requests
import xlsxwriter
from bs4 import BeautifulSoup

row1 = 0
row = 0
col = 0

workbook = xlsxwriter.Workbook('instagram_users_data.xlsx')
worksheet = workbook.add_worksheet()

worksheet.write(row, col, 'Name')
worksheet.write(row, col + 1, 'Handle')
worksheet.write(row, col + 2, 'Email')
worksheet.write(row, col + 3, 'Category')
worksheet.write(row, col + 4, 'No Of Followers')
worksheet.write(row, col + 5, 'Average Likes')
worksheet.write(row, col + 6, 'Total Posts')

workbook2 = xlsxwriter.Workbook('instagram_all_users_data.xlsx')
worksheet2 = workbook2.add_worksheet()

worksheet2.write(row1, col, 'Name')
worksheet2.write(row1, col + 1, 'Handle')
worksheet2.write(row1, col + 2, 'Email')
worksheet2.write(row1, col + 3, 'Phone Number')
worksheet2.write(row1, col + 4, 'Category')
worksheet2.write(row1, col + 5, 'No Of Followers')
worksheet2.write(row1, col + 6, 'No Of Following')
worksheet2.write(row1, col + 7, 'Average Likes')
worksheet2.write(row1, col + 8, 'Total Posts')
worksheet2.write(row1, col + 9, 'Profile Url')

row += 1
row1 += 1


for i in range(100):
    for j in range(10):
        try:
            print(j)
            list_link = 'https://www.instagram.com/directory/profiles/' + str(i) + '-' + str(j) + '/'
            list_response = requests.get(list_link, timeout=9999)
            list_content = BeautifulSoup(list_response.content, "html.parser")

            # The directory page embeds its profile list as JSON inside the
            # window._sharedData script tag.
            script_list = list_content.findAll('script')
            script_list = script_list[3].text
            script_list = script_list.replace(';', '')
            script_list = script_list.replace('window._sharedData = ', '')
            all_data = json.loads(script_list)
            all_data = all_data['entry_data']['ProfilesDirectoryPage'][0]['profile_data']['profile_list']
            all_data = json.loads(all_data)

            for data in all_data:
                try:
                    page_link = 'https://www.instagram.com/' + str(data) + '/'
                    print(page_link)
                    page_response = requests.get(page_link, timeout=9999)
                    page_content = BeautifulSoup(page_response.content, "html.parser")

                    script_tag = page_content.findAll('script')
                    script_tag = script_tag[3].text
                    clean_script = script_tag.replace(';', '')
                    clean_script = clean_script.replace('window._sharedData = ', '')

                    try:
                        json_data = json.loads(clean_script)
                        user = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
                        no_of_followers = user['edge_followed_by']['count']
                        is_business_account = user['is_business_account']
                        total_posts = user['edge_owner_to_timeline_media']['count']

                        # Keep small business accounts with a reasonable posting history.
                        if is_business_account and 250 < no_of_followers < 20000 and total_posts > 15:
                            name = user['full_name']
                            handle = user['username']
                            email = user['business_email']
                            phone_number = user['business_phone_number']
                            category = user['business_category_name']
                            no_of_following = user['edge_follow']['count']
                            # Like/comment counts of the most recent post.
                            average_likes_per_posts = user['edge_owner_to_timeline_media']['edges'][0]['node']['edge_liked_by']['count']
                            average_comments_per_posts = user['edge_owner_to_timeline_media']['edges'][0]['node']['edge_media_to_comment']['count']

                            worksheet.write(row, col, name)
                            worksheet.write(row, col + 1, handle)
                            worksheet.write(row, col + 2, email)
                            worksheet.write(row, col + 3, category)
                            worksheet.write(row, col + 4, no_of_followers)
                            worksheet.write(row, col + 5, average_likes_per_posts)
                            worksheet.write(row, col + 6, total_posts)
                            row += 1

                            worksheet2.write(row1, col, name)
                            worksheet2.write(row1, col + 1, handle)
                            worksheet2.write(row1, col + 2, email)
                            worksheet2.write(row1, col + 3, phone_number)
                            worksheet2.write(row1, col + 4, category)
                            worksheet2.write(row1, col + 5, no_of_followers)
                            worksheet2.write(row1, col + 6, no_of_following)
                            worksheet2.write(row1, col + 7, average_likes_per_posts)
                            worksheet2.write(row1, col + 8, total_posts)
                            worksheet2.write(row1, col + 9, page_link)
                            row1 += 1
                    except Exception:
                        pass
                except Exception:
                    # Back off before trying the next profile (likely rate limiting).
                    time.sleep(200)
        except Exception:
            # Back off longer if the directory page itself fails.
            time.sleep(400)

workbook.close()
workbook2.close()
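
Both lookups above rely on window._sharedData being the fourth script tag on the page, which breaks as soon as Instagram reorders its markup. A more robust helper (my own sketch, not part of the original script) searches the script tags by content instead:

import json
from bs4 import BeautifulSoup

def extract_shared_data(html):
    """Locate the window._sharedData blob by content instead of a fixed script index."""
    soup = BeautifulSoup(html, "html.parser")
    for script in soup.find_all('script'):
        text = script.text or ''
        if text.strip().startswith('window._sharedData'):
            # Strip the assignment and trailing semicolon, then parse the JSON payload.
            payload = text.split('=', 1)[1].strip().rstrip(';')
            return json.loads(payload)
    return None

It can then replace the index-based extraction in both loops, e.g. json_data = extract_shared_data(page_response.content).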

How to scrape all job listings from https://stackoverflow.com/jobs

'''
Scrape all Stack Overflow jobs and save the response to a file in JSON format.
'''

import json
import requests
import datetime
from bs4 import BeautifulSoup

page_link = 'https://stackoverflow.com/jobs?sort=p'
page_response = requests.get(page_link, timeout=9999)
page_content = BeautifulSoup(page_response.content, "html.parser")

# Derive the number of result pages from the pagination links.
data = page_content.findAll('a', attrs={'class': 'job-link'})[-2]
loop_count = data.find('span').text
response = []

# Override: crawl only the first page for now; drop this line to walk every page.
loop_count = 1
for i in range(int(loop_count)):
    page_link = 'https://stackoverflow.com/jobs?sort=p&pg=' + str(i)
    page_response = requests.get(page_link, timeout=9999)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    data = page_content.findAll('a', attrs={'class': 's-link__visited'})
    for d in data:
        job_page_link = 'https://stackoverflow.com' + d['href']
        response_obj = {}

        job_page_response = requests.get(job_page_link, timeout=9999)
        job_page_content = BeautifulSoup(
            job_page_response.content, "html.parser")

        job_header = job_page_content.findAll('header', {'class': 'job-details--header'})
        job_header = job_header[0]

        # ----- Job Details ---- #
        company_logo = job_header.find('div', {'class': 's-avatar'})
        company_logo = company_logo.find('img')
        company_logo = company_logo['src']
        job_title = job_header.find('h1', {'class': 'fs-headline1'})
        job_title = job_title.find('a').text

        company_info = job_page_content.find(
            'div', attrs={'class': 'fc-black-700 fs-body3'}).text
        company_info = company_info.split('\n')

        salary = ''
        location_remote = ''
        location_visa = ''
        location_relocation = ''
        try:
            company_obj_list = job_header.find('div', {'class': 'mt12'})
            company_obj_list = company_obj_list.findAll('span')

            for company_obj in company_obj_list:
                if '-salary' in company_obj['class']:
                    salary = company_obj.text.strip()
                if '-remote' in company_obj['class']:
                    location_remote = company_obj.text.strip()
                if '-visa' in company_obj['class']:
                    location_visa = company_obj.text.strip()
                if '-relocation' in company_obj['class']:
                    location_relocation = company_obj.text.strip()
        except Exception:
            # Some postings omit the salary/remote/visa/relocation badges.
            pass

        date_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        response_obj.update({'timestamp': date_time,
                             'job_url': job_page_link,
                             'company_logo': company_logo,
                             'job_title': job_title,
                             'company_name': company_info[1],
                             'company_location': company_info[3],
                             'salary': salary,
                             'remote': location_remote,
                             'visa': location_visa,
                             'relocation': location_relocation})

        job_data = job_page_content.find('div', attrs={'id': 'overview-items'})
        sections = job_data.findAll('section', {'class': 'mb32'})

        # ---- Section one: About this job ---- #
        section_1 = sections[0]
        about_job = section_1.findAll('div', {'class': 'mb8'})

        for ajob in about_job:
            headings = ajob.findAll('span')
            label = headings[0].text.strip().replace(':', '').replace(' ', '_').lower()
            values = headings[1].text
            response_obj.update({label: values})

        # ---- Section two: Technologies ---- #
        section_2 = sections[1]
        technologies = section_2.findAll('a', {'class': 'job-link'})
        tech_stack = ''
        for tech in technologies:
            tech_stack = tech_stack + tech.text + ', '
        response_obj.update({'technologies': tech_stack})

        # ---- Section three: Job description ---- #
        section_3 = sections[2]
        job_description = section_3.find('div').text
        response_obj.update({'job_description': job_description})

        response.append(response_obj)

with open('response.json', 'w') as outfile:
    json.dump(response, outfile)
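
If a spreadsheet is needed instead of JSON, the saved response can be flattened into CSV with the standard library. A minimal sketch of my own; the output filename is an assumption:

import csv
import json

# Load the scraped jobs and write them out with the union of all keys as columns.
with open('response.json') as infile:
    jobs = json.load(infile)

fieldnames = sorted({key for job in jobs for key in job})
with open('response.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(jobs)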