
URL sorter mk. 14

By: waterapple on Mar 29th, 2013  |  syntax: Python  |  size: 8.57 KB  |  hits: 25  |  expires: Never
#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        linksorter
# Purpose:     Sort URLs from a file into one list per site
#
# Author:      new
#
# Created:     20/03/2013
# Copyright:   (c) new 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import re
import urlparse  # Python 2 standard library (urllib.parse in Python 3)
import os

def uniquify(seq):
    # Order-preserving list uniquifier, from
    # http://www.peterbe.com/plog/uniqifiers-benchmark
    checked = []
    for e in seq:
        if e not in checked:
            checked.append(e)
    return checked
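
# Example (illustrative): uniquify([3, 1, 3, 2, 1]) returns [3, 1, 2];
# the first occurrence of each element is kept, in order.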

def extractlinks(html):
    # Regex copied from:
    # http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    links = re.findall(url_regex, html)
    return links
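
# Example (illustrative, using a URL from the converter comments below):
#   extractlinks('see http://ssenarrya.deviantart.com/ for more')
#   returns ['http://ssenarrya.deviantart.com/']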

def load_textfile(filepath):
    # Return the data in the specified file; if no file is found, create it.
    new_file_text = 'Put text containing URLs here'
    if os.path.exists(filepath):
        f = open(filepath, 'rU')
        file_data = f.read()
        f.close()
        print 'File loaded'
        return file_data
    else:
        f = open(filepath, 'w')
        f.write(new_file_text)
        f.close()
        # Return the placeholder text so callers never receive None
        return new_file_text
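
# Note: on the first run this creates the input file with placeholder text and
# returns it, so the export functions below have a string to work on.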

def save_text(filepath, data):
    print 'save_text:filepath', filepath
    save_dir = os.path.dirname(filepath)
    print 'save_text:save_dir', save_dir
    # The empty-string check guards against a bare filename with no directory part
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    f = open(filepath, 'w')
    f.write(data)
    f.close()
    print 'save_text:Saved data to file', filepath
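
# Example (illustrative, hypothetical path):
#   save_text('output/deviantart.com.txt', 'http://ssenarrya.deviantart.com/\n')
#   creates output/ if needed and writes the URL to the file.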

def extract_domain(url):
    # Return the domain from the given URL
    # print 'extract_domain:url', url
    full_domain = urlparse.urlparse(url).netloc
    # print 'extract_domain:full_domain', full_domain
    # Strip a leading www. so results match the dispatch keys in
    # export_usernames_from_file (e.g. www.furaffinity.net -> furaffinity.net)
    if full_domain.startswith('www.'):
        full_domain = full_domain[len('www.'):]
    # Handle known problem cases
    # DeviantArt.com
    if 'deviantart.com' in full_domain:
        short_domain = re.sub(r'.+\.deviantart\.com', 'deviantart.com', full_domain)
        return short_domain
    # Tumblr.com
    elif '.tumblr.com' in full_domain:
        short_domain = re.sub(r'.+\.tumblr\.com', 'tumblr.com', full_domain)
        return short_domain
    else:
        return full_domain
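
# Examples (illustrative, using URLs from the converter comments below):
#   extract_domain('http://ssenarrya.deviantart.com/') -> 'deviantart.com'
#   extract_domain('http://www.furaffinity.net/user/scorpdk/') -> 'furaffinity.net'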


def sanitize_filename(filename):
    # Sanitize a filename (not a path), so '/' is stripped along with other
    # unsafe characters
    sanitized_filename = re.sub(r'[^.a-zA-Z0-9_-]+', '', filename)
    return sanitized_filename
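
# Example (illustrative): sanitize_filename('deviantart.com') -> 'deviantart.com';
# unsafe characters are dropped, e.g. sanitize_filename('a/b?.txt') -> 'ab.txt'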

def build_link_dict(unsorted_data):
    # Turn a string with URLs in it into a dict of the form
    # {'DomainName.com': ['url1', 'url2']}
    url_list = extractlinks(unsorted_data)
    print 'url_list', url_list
    sorting_dict = {}
    for url in url_list:
        # print 'url', url
        url_domain = extract_domain(url)
        # print 'url_domain', url_domain
        if url_domain not in sorting_dict:
            sorting_dict[url_domain] = []
        sorting_dict[url_domain].append(url)
    return sorting_dict
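
# Example (illustrative):
#   build_link_dict('http://ssenarrya.deviantart.com/ and https://inkbunny.net/nargleflex')
#   returns {'deviantart.com': ['http://ssenarrya.deviantart.com/'],
#            'inkbunny.net': ['https://inkbunny.net/nargleflex']}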


def export_urls_from_file(input_file_path='paste_here.txt'):
    # Read the specified text file and output a list of links for each domain
    unsorted_data = load_textfile(input_file_path)
    print 'unsorted_data', unsorted_data
    link_dict = build_link_dict(unsorted_data)
    for domain_key in link_dict.keys():
        print 'domain_key', domain_key
        output_filename = sanitize_filename(domain_key) + '.txt'
        output_data = ''
        for output_url in link_dict[domain_key]:
            output_data += (output_url + '\n')
        print 'output_data', output_data
        output_path = 'output/' + output_filename
        save_text(output_path, output_data)
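
# Usage (illustrative): export_urls_from_file() reads paste_here.txt and writes
# one URL list per domain under output/, e.g. output/deviantart.com.txt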

# Converter functions: these take URLs and return usernames for that site
def deviantart_convert(url):
    # Turn a DeviantArt URL into a DeviantArt username.
    # Valid URL examples:
    # http://ssenarrya.deviantart.com/
    # https://nawa88.deviantart.com/art/Pinkie-Pie-s-after-party-at-night-rule-34-313639046
    pattern = r'https?://(.+?)\.deviantart\.com'
    username_search = re.search(pattern, url, re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
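
# Example (from the first valid URL above):
#   deviantart_convert('http://ssenarrya.deviantart.com/') -> 'ssenarrya'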

def furaffinity_convert(url):
    # Turn a FurAffinity URL into a FurAffinity username.
    # Valid URL examples:
    # http://www.furaffinity.net/user/scorpdk/
    pattern = r'furaffinity\.net/user/([^/]+)'
    username_search = re.search(pattern, url, re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
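
# Example (from the valid URL above):
#   furaffinity_convert('http://www.furaffinity.net/user/scorpdk/') -> 'scorpdk'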

def inkbunny_convert(url):
    # Turn an InkBunny URL into an InkBunny username.
    # Valid URL examples:
    # https://inkbunny.net/nargleflex
    # Watch out for submission pages when calling this
    pattern = r'inkbunny\.net/([^/]+)'
    username_search = re.search(pattern, url, re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
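
# Example (from the valid URL above):
#   inkbunny_convert('https://inkbunny.net/nargleflex') -> 'nargleflex'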

def tumblr_convert(url):
    # Turn a Tumblr URL into a Tumblr username.
    # Valid URL examples:
    # http://peanutbtter.tumblr.com/
    # Some Tumblr blogs use their own domain instead of tumblr.com; this will
    # not work on those.
    pattern = r'https?://(?:www\.)?(.+?)\.tumblr\.com/'
    username_search = re.search(pattern, url, re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
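
# Example (from the valid URL above):
#   tumblr_convert('http://peanutbtter.tumblr.com/') -> 'peanutbtter'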

def pixiv_convert(url):  # TODO
    # Turn a Pixiv URL into a Pixiv user ID.
    # Valid URL examples:
    # http://www.pixiv.net/member.php?id=312468
    # http://www.pixiv.net/bookmark.php?id=293363&rest=show&p=3
    # http://www.pixiv.net/member_illust.php?id=2947383
    patterns = [
        r'pixiv\.net/member\.php\?id=(\d+)',
        r'pixiv\.net/bookmark\.php\?id=(\d+)',
        r'pixiv\.net/member_illust\.php\?id=(\d+)',
    ]
    for pattern in patterns:
        username_search = re.search(pattern, url, re.IGNORECASE)
        if username_search:
            username = username_search.group(1)
            return username
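
# Example (from the first valid URL above):
#   pixiv_convert('http://www.pixiv.net/member.php?id=312468') -> '312468'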

def aryion_convert(url):  # TODO
    # Turn an Ekas Portal URL into an Ekas Portal username.
    # Valid URL examples:
    # http://aryion.com/g4/user/GTSdev
    pattern = r'aryion\.com/g4/user/([^/]+)'
    username_search = re.search(pattern, url, re.IGNORECASE)
    if username_search:
        username = username_search.group(1)
        return username
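
# Example (from the valid URL above):
#   aryion_convert('http://aryion.com/g4/user/GTSdev') -> 'GTSdev'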

# End converter functions

def export_usernames_from_file(input_file_path='paste_here.txt'):
    # Read the specified text file and output a list of usernames for each
    # recognized domain

    unsorted_data = load_textfile(input_file_path)
    print 'unsorted_data', unsorted_data
    link_dict = build_link_dict(unsorted_data)
    for domain_key in link_dict.keys():
        print 'domain_key', domain_key
        output_filename = sanitize_filename(domain_key) + '.txt'
        domain_lines = []
        for output_url in link_dict[domain_key]:
            # print 'output_url', output_url
            # Handle DeviantArt
            if domain_key == 'deviantart.com':
                domain_lines.append(deviantart_convert(output_url))
            # Handle FurAffinity
            elif domain_key == 'furaffinity.net':
                domain_lines.append(furaffinity_convert(output_url))
            # Handle InkBunny (skip script pages such as submission views)
            elif domain_key == 'inkbunny.net':
                if '.php' not in output_url:
                    domain_lines.append(inkbunny_convert(output_url))
            # Handle Tumblr
            elif domain_key == 'tumblr.com':
                domain_lines.append(tumblr_convert(output_url))
            # Handle Pixiv
            elif domain_key == 'pixiv.net':
                if 'pixiv.net/member.php' in output_url:
                    domain_lines.append(pixiv_convert(output_url))
            # Handle Ekas Portal
            elif domain_key == 'aryion.com':
                domain_lines.append(aryion_convert(output_url))
            # If no handler, keep the raw URL
            else:
                domain_lines.append(output_url)
        # print 'domain_lines', domain_lines
        unique_domain_lines = uniquify(domain_lines)
        output_string = ''
        # Assemble output string, skipping URLs the converters could not parse
        for line in unique_domain_lines:
            if line is None:
                continue
            output_string += str(line) + '\n'
        # print 'output_string', output_string
        output_path = 'parsed_output/' + output_filename
        save_text(output_path, output_string)
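
# Usage (illustrative): export_usernames_from_file() reads paste_here.txt and
# writes one username list per recognized domain under parsed_output/, e.g.
# parsed_output/deviantart.com.txt containing 'ssenarrya' for the URL above.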

def main():
    # export_urls_from_file()
    export_usernames_from_file()

if __name__ == '__main__':
    main()