I created this as a bootstrap for a script that generates an Excel spreadsheet from information scraped from an array of URLs. Comments within the script indicate which values/columns/headers need to be updated.
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""Bootstrap script: scrape values from an array of URLs and write them to an
Excel spreadsheet.

Update the headers, the CSS selectors, and the number of columns to match the
site actually being scraped — the three selectors below are identical
placeholders taken from one example page.
"""
import contextlib
import os

import numpy as np  # retained from the original bootstrap
import requests
import xlsxwriter
from bs4 import BeautifulSoup

# URLs to be parsed; fill this in before running.
arrofurls = []

# Accumulated rows for the spreadsheet; the first row holds the headers.
arrofitems = [['NAME', 'ADDRESS', 'CITY']]


def getvaluesperurl(arrayofurls):
    """Fetch each URL and append one [name, address, city] row to the
    module-level ``arrofitems`` list.

    The three selectors are placeholders (currently identical); replace each
    with the real selector for its column. A cell whose selector matches
    nothing is recorded as ``'NA'``.
    """
    # NOTE(review): the example selectors all point at the same element —
    # presumably each column gets its own selector on a real site.
    selectors = (
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
        "#school-single > div.ccsf-content-wrap > div > ul > li:nth-of-type(16) > div.ccsf-value",
    )
    for url in arrayofurls:
        # Timeout so one dead host cannot hang the whole run.
        r = requests.get(url, timeout=30)
        soup = BeautifulSoup(r.text, "lxml")
        # Check the element for None BEFORE taking .text: the original code
        # called .text first, which raises AttributeError on a missed match
        # and made the None check unreachable.
        row = []
        for selector in selectors:
            element = soup.select_one(selector)
            row.append(element.text if element is not None else 'NA')
        # Mutate the global list in place; the original rebound a local name
        # via np.concatenate, so the scraped rows were silently discarded.
        arrofitems.append(row)


# Scrape first so the rows exist before the spreadsheet is written
# (the original defined — but never called — this after workbook.close()).
getvaluesperurl(arrofurls)

# Delete any existing spreadsheet before writing a fresh one.
with contextlib.suppress(FileNotFoundError):
    os.remove('Scrape-output.xlsx')

# Create a workbook and add a worksheet.
workbook = xlsxwriter.Workbook('Scrape-output.xlsx')
worksheet = workbook.add_worksheet()

# Rows and columns are zero indexed; row 0 receives the headers.
# Rename name1/address1/city1 to match whatever columns you scrape.
for row, (name1, address1, city1) in enumerate(arrofitems):
    worksheet.write(row, 0, name1)
    worksheet.write(row, 1, address1)
    worksheet.write(row, 2, city1)

# Example of writing a total with a formula — not needed for this bootstrap:
# worksheet.write(row, 0, 'Total')
# worksheet.write(row, 1, '=SUM(B1:B4)')

workbook.close()