Thursday, November 22, 2018

Scrape data from the Canadian Academy of Periodontology (http://www.cap-acp.ca) and export it to an Excel file.

import pdb
from string import ascii_lowercase

import requests

import xlsxwriter
from bs4 import BeautifulSoup

# Output spreadsheet; the scraped records are written into its first sheet.
workbook = xlsxwriter.Workbook('users_data.xlsx')
worksheet = workbook.add_worksheet()

# Write cursor: `row` is the next row to fill, `col` the left-most column.
row = 0
col = 0

# Header row: one title per column, in the order the record fields are
# written further down.
for offset, title in enumerate((
        'Name',
        'Address',
        'Country',
        'PinCode',
        'Phone Number',
        'Fax',
        'Email ID',
        'Web Site',
)):
    worksheet.write(row, col + offset, title)
row += 1

# Site root; profile links found on the search pages are appended to it.
BASE_URL = 'http://www.cap-acp.ca'
# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30


def _labelled_line(lines, label):
    """Return the first entry of *lines* that contains *label*, or ''."""
    return next((line for line in lines if label in line), '')


# Query the member search once per letter, follow every profile link found in
# the result page and write one spreadsheet row per profile.
# NOTE(review): the same profile may be returned for several letters, which
# would produce duplicate rows - confirm against the site's search semantics.
try:
    for ascii_char in ascii_lowercase:
        page_link = (BASE_URL
                     + '/en/search/index.php?page=4&search=' + ascii_char)
        try:
            page_response = requests.get(page_link, timeout=REQUEST_TIMEOUT)
            page_response.raise_for_status()
        except requests.RequestException as exc:
            # A failing search page only costs us this letter.
            print('Skipping search page %s: %s' % (page_link, exc))
            continue
        page_content = BeautifulSoup(page_response.content, "html.parser")

        for div in page_content.findAll('div', attrs={'class': 'main'}):
            for a in div.findAll('a'):
                href = a.get('href')
                # Anchors without a target, and the link back to the search
                # form itself, are not profile pages.
                if not href or href == 'index.php':
                    continue

                child_link = BASE_URL + href
                try:
                    child_page_response = requests.get(
                        child_link, timeout=REQUEST_TIMEOUT)
                    child_page_response.raise_for_status()
                except requests.RequestException as exc:
                    # A failing profile must not abort the rest of the page.
                    print('Skipping profile %s: %s' % (child_link, exc))
                    continue
                child_page_content = BeautifulSoup(
                    child_page_response.content, "html.parser")

                panels = child_page_content.findAll(
                    'div', attrs={'class': 'panel'})
                if not panels:
                    print('Skipping profile %s: no panel found' % child_link)
                    continue
                user = panels[0].text.split('\n')

                # Lines 2-6 of the panel text are read positionally; without
                # them there is no usable record.
                if len(user) < 7:
                    print('Skipping profile %s: unexpected layout'
                          % child_link)
                    continue
                name, address, country, pin_code, phone_number = user[2:7]

                # Fax / e-mail / web site are optional, so they are located
                # by label among the remaining lines instead of by a fixed
                # index that shifts when one of them is absent.
                extras = user[7:]
                fax = _labelled_line(extras, 'Fax')
                email = _labelled_line(extras, 'E-mail')
                website = _labelled_line(extras, 'Web site')

                record = (name, address, country, pin_code,
                          phone_number, fax, email, website)
                for offset, value in enumerate(record):
                    worksheet.write(row, col + offset, value)
                row += 1
finally:
    # Always flush what was collected, even if the run is interrupted.
    workbook.close()

No comments:

Post a Comment