Using WorkerPool to multithread through a list of URLs

I'm trying to use multiple threads to go through a text file of URLs and scrape the contents found at each one. It works for roughly the first 20 URLs (the exact number varies), but then it consistently gets stuck on the last URL in the file. It also doesn't seem to process them in order.

I have no idea why it's getting stuck or even where to start debugging, so thank you so much for your help.

from bs4 import BeautifulSoup, SoupStrainer
import urllib3
import urllib2
import io
import os
import re
import workerpool
from urllib2 import Request, urlopen, URLError

NUM_SOCKETS = 3
NUM_WORKERS = 5

urlfile = open("dailynewsurls.txt", 'r')  # read one line at a time until end of file
http = urllib3.PoolManager(maxsize=NUM_SOCKETS)
workers = workerpool.WorkerPool(size=NUM_WORKERS)

class MyJob(workerpool.Job):
    def __init__(self, url):
        self.url = url

    def run(self):
        r = http.request('GET', self.url)
        req = urllib2.Request(url)
        try:
            page = urllib2.urlopen(req)
        except:
            print "had to skip one"
            return            
        pagecontent = page.read()  # read the raw HTML at this URL

        # parse the page content with BeautifulSoup
        soup = BeautifulSoup(pagecontent)

        # grab every <title> and <article> tag; only the first of each is used below
        title = soup.find_all('title')
        article = soup.find_all('article')


        try:
            title = str(title[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return
        try:
            article = str(article[0].get_text().encode('utf-8'))
        except:
            print "had to skip one"
            return

        try:
            # build the output filename from the article title
            output_files_pathname = 'DailyNews/'  # directory where output will go
            new_filename = title + ".txt"

            # write the title and article text into the file
            outfile = open(output_files_pathname + new_filename, 'w')
            outfile.write(title)
            outfile.write("\n")
            outfile.write(article)
            outfile.close()
            print "%r added as a text file" % title
            return

        except:
            print "had to skip one"
            return

        return

for url in urlfile:  
    workers.put(MyJob(url))  

workers.shutdown()
workers.wait()

print "All done."

Here's an example list of the URLs:

http://www.nydailynews.com/entertainment/tv-movies/x-factor-season-2-episode-2-recap-oops-britney-spears-article-1.1159546
http://www.nydailynews.com/new-york/brooklyn/lois-mclohon-resurfaced-iconic-daily-news-coney-island-cheesecake-photo-brings-back-memories-50-year-long-romance-article-1.1160457
http://www.nydailynews.com/new-york/uptown/espaillat-linares-rivals-bitter-history-battle-state-senate-seat-article-1.1157994
http://www.nydailynews.com/sports/baseball/mlb-power-rankings-yankees-split-orioles-tumble-rankings-nationals-shut-stephen-strasburg-hang-top-spot-article-1.1155953
http://www.nydailynews.com/news/national/salon-sell-internet-online-communities-article-1.1150614
http://www.nydailynews.com/sports/more-sports/jiyai-shin-wins-women-british-open-dominating-fashion-record-nine-shot-victory-article-1.1160894
http://www.nydailynews.com/entertainment/music-arts/justin-bieber-offered-hockey-contract-bakersfield-condors-minor-league-team-article-1.1157991
http://www.nydailynews.com/sports/baseball/yankees/umpire-blown-call-9th-inning-dooms-yankees-5-4-loss-baltimore-orioles-camden-yards-article-1.1155141
http://www.nydailynews.com/entertainment/gossip/kellie-pickler-shaving-head-support-best-friend-cancer-fight-hair-article-1.1160938
http://www.nydailynews.com/new-york/secret-103-000-settlement-staffers-accused-assemblyman-vito-lopez-sexual-harassment-included-penalty-20k-involved-talked-details-article-1.1157849
http://www.nydailynews.com/entertainment/tv-movies/ricki-lake-fun-adds-substance-new-syndicated-daytime-show-article-1.1153301
http://www.nydailynews.com/sports/college/matt-barkley-loyalty-usc-trojans-contention-bcs-national-championship-article-1.1152969
http://www.nydailynews.com/sports/daily-news-sports-photos-day-farewell-andy-roddick-world-1-u-s-open-champ-retires-loss-juan-martin-del-potro-article-1.1152827
http://www.nydailynews.com/entertainment/gossip/britney-spears-made-move-relationship-fiance-jason-trawick-reveals-article-1.1152722
http://www.nydailynews.com/new-york/brooklyn/brooklyn-lupus-center-tayumika-zurita-leads-local-battle-disease-difficult-adversary-article-1.1153494
http://www.nydailynews.com/life-style/fashion/kate-middleton-prabal-gurung-dress-sells-hour-myhabit-site-sold-1-995-dress-599-article-1.1161583
http://www.nydailynews.com/news/politics/obama-romney-campaigns-vie-advantage-president-maintains-lead-article-1.1161540
http://www.nydailynews.com/life-style/free-cheap-new-york-city-tuesday-sept-11-article-1.1155950
http://www.nydailynews.com/news/world/dozens-storm-embassy-compound-tunis-article-1.1159663
http://www.nydailynews.com/opinion/send-egypt-message-article-1.1157828
http://www.nydailynews.com/sports/more-sports/witnesses-feel-sheryl-crow-lance-amstrong-activities-article-1.1152899
http://www.nydailynews.com/sports/baseball/yankees/hiroki-kuroda-replacing-cc-sabathia-yankees-ace-pitcher-real-possibility-playoffs-looming-article-1.1161812
http://www.nydailynews.com/life-style/eats/finland-hosts-pop-down-restaurant-belly-earth-262-feet-underground-article-1.1151523
http://www.nydailynews.com/sports/more-sports/mighty-quinn-sept-23-article-1.1165584
http://www.nydailynews.com/sports/more-sports/jerry-king-lawler-stable-condition-suffering-heart-attack-wwe-raw-broadcast-monday-night-article-1.1156915
http://www.nydailynews.com/news/politics/ambassador-chris-stevens-breathing-libyans-found-american-consulate-rescue-article-1.1161454
http://www.nydailynews.com/news/crime/swiss-banker-bradley-birkenfeld-104-million-reward-irs-blowing-whistle-thousands-tax-dodgers-article-1.1156736
http://www.nydailynews.com/sports/hockey/nhl-board-governors-votes-favor-lockout-league-players-association-fail-reach-agreement-cba-article-1.1159131
http://www.nydailynews.com/news/national/iphone-5-works-t-network-article-1.1165543
http://www.nydailynews.com/sports/baseball/yankees/yankees-broadcasters-michael-kay-ken-singleton-opportunity-important-statement-article-1.1165479
http://www.nydailynews.com/news/national/boss-year-michigan-car-dealer-retires-employees-1-000-year-service-article-1.1156763
http://www.nydailynews.com/entertainment/tv-movies/hero-denzel-washington-clint-eastwood-article-1.1165538
http://www.nydailynews.com/sports/football/giants/ny-giants-secondary-roasted-tony-romo-dallas-cowboys-offense-article-1.1153055
http://www.nydailynews.com/news/national/hide-and-seek-tragedy-3-year-old-suffocates-hiding-bean-bag-article-1.1160138
1 Answer

Liam Horne (accepted answer):

I would try using the threading module instead; here is a version that I think works:

from bs4 import BeautifulSoup
import threading
import urllib2

def fetch_url(url):
    urlHandler = urllib2.urlopen(url)
    html = urlHandler.read()
    # parse the page content with BeautifulSoup
    soup = BeautifulSoup(html)

    # grab every <title> and <article> tag; only the first of each is used below
    title = soup.find_all('title')
    article = soup.find_all('article')

    try:
        title = str(title[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad title\n"
        return
    try:
        article = str(article[0].get_text().encode('utf-8'))
    except:
        print "had to skip one bad article"
        return

    try:
        # build the output filename from the article title
        output_files_pathname = 'DailyNews/'  # directory where output will go
        new_filename = title + ".txt"

        # write the title and article text into the file
        outfile = open(output_files_pathname + new_filename, 'w')
        outfile.write(title)
        outfile.write("\n")
        outfile.write(article)
        outfile.close()
        print "%r added as a text file" % title
        return

    except:
        print "had to skip one: can't write file"
        return

    return

# read the URL list one line at a time; strip() removes the trailing newline
with open("dailynewsurls.txt", 'r') as urlfile:
    threads = [threading.Thread(target=fetch_url, args=(url.strip(),)) for url in urlfile]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
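
One caveat: this starts a separate thread for every URL in the file at once, which can be a lot of threads for a long list. If you want to cap concurrency the way your WorkerPool did, a thread-backed pool from the standard library works too. Here is a minimal sketch using Python 2's multiprocessing.dummy.Pool; the pool size of 5 is an arbitrary choice, and it reuses the fetch_url function above:

from multiprocessing.dummy import Pool  # thread-based pool with the multiprocessing API

pool = Pool(5)  # at most 5 URLs are fetched concurrently
with open("dailynewsurls.txt", 'r') as urlfile:
    urls = [line.strip() for line in urlfile if line.strip()]
pool.map(fetch_url, urls)  # blocks until every URL has been processed
pool.close()
pool.join()

Bounding the pool keeps at most five requests in flight at a time, which is also gentler on the site you're scraping.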