Here is the code:
#!/usr/bin/env python
import re
import os
import sys
import urllib2
class URLRepo(object):
    """Collection of the child links and e-mail addresses found on one page.

    The page at ``URL`` is downloaded once, in the constructor, and scanned
    with two regular expressions.

    Attributes:
        URL: address of the parent page, exactly as passed to the constructor.
        html: source of the downloaded page.
        links: list of ``(url, title)`` tuples for anchors whose target starts
            with ``http`` or ``www`` and does not start with the parent URL.
        emails: unique e-mail addresses, in order of first appearance.
    """

    URL = None  # address of the parent page whose child URLs are collected

    # Compiled once for the class instead of on every instantiation.
    # NOTE(review): the 2-4 letter limit truncates addresses with longer
    # top-level domains; kept as-is so existing results do not change.
    _EMAIL_RE = re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}',
                           re.IGNORECASE)

    # Captures (href value, anchor text).  The previous pattern had a single
    # group behind a greedy ``.*``, so every match was an empty string and
    # indexing it raised IndexError as soon as a page contained ``</a>``.
    # NOTE(review): reconstructed from the "URL and its title" intent and the
    # ``i[0]`` indexing used by the filter -- confirm against the real pages.
    _LINK_RE = re.compile(
        r'<a\s(?:[^>]*?\s)?href\s*=\s*["\']([^"\']*)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL)

    def __init__(self, URL):
        """Download ``URL`` and extract its outgoing links and e-mails.

        Args:
            URL: the internet URL to process.

        Raises:
            Whatever ``urlopen`` raises when the page cannot be fetched.
        """
        self.URL = URL
        raw_html = self._fetch(URL)
        self.html = raw_html

        # Keep only absolute links that lead away from the parent page.
        # Requiring an ``http``/``www`` prefix already rules out relative
        # ('/...'), fragment ('#...') and 'mailto:' targets.
        self.links = [
            (target, title)
            for target, title in self._LINK_RE.findall(raw_html)
            if target.startswith(('http', 'www'))
            and not target.startswith(self.URL)
        ]

        # De-duplicate while preserving first-seen order; the set gives an
        # O(1) membership test instead of list.index() inside try/except.
        self.emails = []
        seen = set()
        for eml in self._EMAIL_RE.findall(raw_html):
            if eml not in seen:
                seen.add(eml)
                self.emails.append(eml)

    @staticmethod
    def _fetch(url):
        """Return the body of ``url`` as text, closing the connection."""
        # Note: this completely disregards proper HTTP workflow, e.g. proper
        # GET headers.
        try:
            from urllib2 import urlopen  # Python 2
        except ImportError:
            from urllib.request import urlopen  # Python 3
        response = urlopen(url)
        try:
            raw = response.read()
        finally:
            response.close()
        if not isinstance(raw, str):
            # Python 3 returns bytes; the str patterns above need text.
            raw = raw.decode('utf-8', 'replace')
        return raw
if __name__ == '__main__':
    # Command-line entry point: expects the seed URL as the only argument and
    # prints every link and e-mail address collected from that page.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.stderr.write("usage: %s SEED_URL\n" % sys.argv[0])
        sys.exit(2)
    SEED_URL = sys.argv[1]
    A = URLRepo(SEED_URL)
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("List of links:")
    print("--------------")
    for link in A.links:
        print(link)
    print("List of Email addresses")
    print("-----------------------")
    for eml in A.emails:
        print(eml)
No comments:
Post a Comment