I created this as a bootstrap for a script that generates an Excel spreadsheet from information scraped from an array of URLs. Comments within the script indicate which values/columns/headers need to be updated.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""Bootstrap script: scrape values from an array of URLs and write them to an
Excel spreadsheet.

Update the headers, the CSS selectors, and the number of columns to match the
site actually being scraped — the three selectors below are identical
placeholders taken from one example page.
"""
import contextlib
import os

import numpy as np  # retained from the original bootstrap
import requests
import xlsxwriter
from bs4 import BeautifulSoup

# URLs to be parsed; fill this in before running.
arrofurls = []

# Accumulated rows for the spreadsheet; the first row holds the headers.
arrofitems = [['NAME', 'ADDRESS', 'CITY']]


def getvaluesperurl(arrayofurls):
    """Fetch each URL and append one [name, address, city] row to the
    module-level ``arrofitems`` list.

    The three selectors are placeholders (currently identical); replace each
    with the real selector for its column. A cell whose selector matches
    nothing is recorded as ``'NA'``.
    """
    # NOTE(review): the example selectors all point at the same element —
    # presumably each column gets its own selector on a real site.
    selectors = (
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
    )
    for url in arrayofurls:
        # Timeout so one dead host cannot hang the whole run.
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.text, "lxml")
        # Check the element for None BEFORE taking .text: the original code
        # called .text first, which raises AttributeError on a missed match
        # and made the None check unreachable.
        row = []
        for selector in selectors:
            element = soup.select_one(selector)
            row.append(element.text if element is not None else 'NA')
        # Mutate the global list in place; the original rebound a local name
        # via np.concatenate, so the scraped rows were silently discarded.
        arrofitems.append(row)


# Scrape first so the rows exist before the spreadsheet is written
# (the original defined — but never called — this after workbook.close()).
getvaluesperurl(arrofurls)

# Delete any existing spreadsheet before writing a fresh one.
with contextlib.suppress(FileNotFoundError):
    os.remove('Scrape-output.xlsx')

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Scrape-output.xlsx')
worksheet = workbook.add_worksheet()

# Rows and columns are zero indexed; row 0 receives the headers.
# Rename name1/address1/city1 to match whatever columns you scrape.
for row, (name1, address1, city1) in enumerate(arrofitems):
    worksheet.write(row, 0, name1)
    worksheet.write(row, 1, address1)
    worksheet.write(row, 2, city1)

# Example of writing a total with a formula — not needed for this bootstrap:
# worksheet.write(row, 0, 'Total')
# worksheet.write(row, 1, '=SUM(B1:B4)')

workbook.close()