I wanted to get some data off an agent listing website and into a spreadsheet. I’d been meaning to play around with Python for web scraping and this was the perfect excuse: There were just enough results that it would take longer to manually copy and paste them than to write a little Python program. (I never want to automate something that will take less time to do than to automate, as long as I’m only going to do it once or twice…)
To get and post the search form, I used requests rather than urllib because dang! is it ever easier to work with. The requests session kept track of the requisite asp.net sessionID cookie without a single line of code on my part. I used BeautifulSoup to process the HTML result pages, and it was fun, if counterintuitive.
# Scrape agent listings from agentquery.com's advanced search.
import re

import requests
from bs4 import BeautifulSoup

# The advanced search (https://agentquery.com/search_advanced.aspx) is an
# ASP.NET page that requires the session cookie to be set, so use a
# requests Session to carry the cookie across requests automatically.
s = requests.Session()

# First HTTP request, without form data: fetch the form to set the
# session cookie and read some hidden form values we must echo back.
myurl = 'https://agentquery.com/search_advanced.aspx'
f = s.get(myurl)
f.raise_for_status()  # fail fast instead of parsing an error page

# Parse and retrieve the three vital hidden form values. Name the parser
# explicitly so behavior doesn't depend on which parsers bs4 finds
# installed (and to silence the "no parser specified" warning).
soup = BeautifulSoup(f.text, 'html.parser')
viewstate = soup.select("#__VIEWSTATE")[0]['value']
viewstategenerator = soup.select("#__VIEWSTATEGENERATOR")[0]['value']
eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']

# Form fields we want checked/set in the POST below:
#   ctl00$chkFiction$15 -> Middle Grade
#   ctl00$chkFiction$22 -> Science Fiction
#   ctl00$chkFiction$8  -> Fantasy
#   ctl00$btnSearch     -> must be 'Search' for the POST to return results
#   ctl00$drpSeek       -> "ARE YOU LOOKING FOR AN AGENT WHO IS ACTIVELY
#                          SEEKING NEW CLIENTS?" drop-down
# Form data for the search POST; the three ASP.NET state fields come
# from the GET above and must be echoed back verbatim.
mypayload = {
    '__EVENTVALIDATION': eventvalidation,
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    'ctl00$chkFiction$15': 'on',   # Middle Grade
    'ctl00$chkFiction$22': 'on',   # Science Fiction
    'ctl00$chkFiction$8': 'on',    # Fantasy
    'ctl00$btnSearch': 'Search',   # required or the POST returns no results
    'ctl00$drpSeek': 'Yes',        # only agents actively seeking clients
}

# Second HTTP request, with form data: gets the first page of results.
f = s.post(myurl, mypayload)

# Each result row emits three elements whose ids match this pattern
# (the '$' on lnkAgent keeps it from also matching lnkAgency ids).
result_id = re.compile(r"dlResults_ctl.*(lnkAgent$|lnkAgency|lblEmail)")

# Use a context manager so the file is closed even if scraping raises.
# (The original bare try/except around open() printed an error and then
# carried on with an undefined file handle.)
with open('tmp.csv', 'w') as outfile:
    getmore = True
    while getmore:
        # Parse the current page of results.
        soup = BeautifulSoup(f.text, 'html.parser')

        # Sift out the agent data: elements arrive in groups of three
        # (agent link, agency link, email span), e.g.:
        # <a class="result" href="agent.aspx?agentid=1128" id="ctl00_dlResults_ctl00_lnkAgent">Suzie Townsend</a>
        # <a class="result" href="http://www.publishersmarketplace.com/members/sztownsend81//" id="ctl00_dlResults_ctl00_lnkAgency" target="_blank" rel="noopener noreferrer">New Leaf Literary and Media</a>
        # <span id="ctl00_dlResults_ctl00_lblEmail">query@newleafliterary.com, put QUERY--SUZIE in the subject line</span>
        results = soup.find_all(id=result_id)

        for i in range(0, len(results), 3):
            a_agentlink = results[i].get_attribute_list('href')[0]
            a_agentname = results[i].text
            a_agencyurl = results[i + 1].get_attribute_list('href')[0]
            a_agencyname = results[i + 1].text
            a_email = results[i + 2].text
            # The agency URL may be absent.
            if a_agencyurl is None:
                a_agencyurl = ''
            # One "|"-delimited row per agent.
            row = "|".join([
                a_agentname,
                "https://agentquery.com/" + a_agentlink,
                a_agencyname,
                a_agencyurl,
                a_email,
            ])
            print(row)           # progress to screen
            outfile.write(row + "\n")

        # The "Next" link (id ctl00_Pager1_lbtnNext) is present only when
        # there are more result pages; if so, load the next page.
        n = soup.find(id='ctl00_Pager1_lbtnNext')
        if n is not None and n.text == 'Next':
            myurl = "https://agentquery.com/" + n.get_attribute_list('href')[0]
            f = s.post(myurl, mypayload)
        else:
            getmore = False