One of the essential steps in the Page Purpose Project was collecting the content from every page on the site to feed into the Natural Language Processing model. The simplest solution was to build a web scraper that pulls the text out of each page.

Below is a demo of the code I used to collect the page content for this project.
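
The scraper expects a file called site_map_links.csv with one relative URL per line. That file is not built in the demo below; one way to produce it, as a rough sketch assuming the site publishes a standard sitemap at /sitemap.xml (the location and URL prefix are assumptions), could look like this:

#rough sketch only: the /sitemap.xml location and the URL prefix are assumptions
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.YOURWEBSITE/sitemap.xml")
soup = BeautifulSoup(r.content, "html.parser")
#each <loc> tag in a sitemap holds a full page URL; keep only the relative path
paths = [loc.text.replace("https://www.YOURWEBSITE", "") for loc in soup.find_all("loc")]
with open("site_map_links.csv", "w") as f:
    f.write("\n".join(paths))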

Coding Demo

#importing the libraries 
import requests
from bs4 import BeautifulSoup
import pandas
#open the CSV of site map links and create a list of relative URLs
with open("site_map_links.csv") as f:
    content = f.readlines()
content = [x.strip() for x in content]
#creating the full hyperlink for each relative URL in the list
link_list = []
for link in content:
    link_list.append("https://www.YOURWEBSITE{}".format(link))
#loop to scrape the website
data = {"Page URL": [], "Page Content": []}
for page in link_list:
    page_content = []
    try:
        r = requests.get(page)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        #collect the text of every <p> tag inside each rich-text component
        for section in soup.find_all("div", {"class": "rich-text-component"}):
            for paragraph in section.find_all("p"):
                page_content.append(paragraph.text)
    except requests.exceptions.RequestException:
        #skip pages that fail to load and move on to the next URL
        pass
    print(page)
    #join the paragraphs with a space so the words do not run together
    full_page_content = " ".join(page_content)
    data["Page URL"].append(page)
    data["Page Content"].append(full_page_content)
#saving data to a pandas data frame
df = pandas.DataFrame(data)
#saving your data to a csv file
df.to_csv("Page_Content.csv", index=False)
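
Once the scrape finishes, the saved file can be read straight back into pandas for the Natural Language Processing step. A minimal sanity check might look like this (the file and column names come from the code above):

#quick sanity check on the scraped data before handing it to the NLP model
import pandas

df = pandas.read_csv("Page_Content.csv")
#pages that failed to load come back empty, so fill the gaps before inspecting
df["Page Content"] = df["Page Content"].fillna("")
print(df.shape)
print(df["Page Content"].str.len().describe())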