# விக்கிமூலம்:பைத்தான்3நிரல்கள்/பகுப்புப்பக்கங்களைஎடுத்தல்

# விக்கிமூலம் இல் இருந்து
# Jump to navigation Jump to search
from pprint import pprint
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests, csv, time
# subprocess,pywikibot, re
# https://stackoverflow.com/questions/41391168/pythonic-beautifulsoup4-how-to-get-remaining-titles-from-the-next-page-link-of
# https://ta.wikisource.org/s/3v3 (wiki short url)
# Seconds to pause after every HTTP request so consecutive requests are
# spaced out and the wiki server is not hammered.
WAIT_TIME = 5
# Seconds before a stalled HTTP request is abandoned.
REQUEST_TIMEOUT = 30
# Input: one category name per row (first "~"-separated field).
INPUT_FILE = '1-catNames.csv'
# Output: page titles are appended here, one per line.  A plain text file is
# used instead of CSV because titles may themselves contain commas, the
# default CSV delimiter.
OUTPUT_FILE = '1-catPages-bs4.txt'
# Prefix of a category page URL on Tamil Wikisource.
CATEGORY_URL_PREFIX = 'https://ta.wikisource.org/wiki/பகுப்பு:'
# Link text of the "next page" pagination link on a category page.
NEXT_PAGE_LABEL = "அடுத்த பக்கம்"


def get_next_link(soup):
    """Return the "next page" anchor of the page listing, or None.

    The search is restricted to the ``#mw-pages`` section (the same section
    ``extract_links`` reads from) so that a pagination link belonging to
    another part of the page, e.g. the subcategory listing, is not followed
    by mistake.

    Args:
        soup: parsed category page (a BeautifulSoup document).

    Returns:
        The ``<a>`` tag whose text is the "next page" label, or None when the
        section is missing or there is no further page.
    """
    pages_section = soup.select_one("#mw-pages")
    if pages_section is None:
        return None
    return pages_section.find("a", string=NEXT_PAGE_LABEL)


def extract_links(soup):
    """Return the ``title`` attribute of every link in the ``#mw-pages`` list.

    Anchors without a ``title`` attribute are skipped instead of raising
    ``KeyError``.

    Args:
        soup: parsed category page (a BeautifulSoup document).

    Returns:
        A list of title strings, in document order (possibly empty).
    """
    return [a['title'] for a in soup.select("#mw-pages li a[title]")]


def _iter_category_titles(session, base_url):
    """Yield every page title of one category, following pagination.

    Each listing page is fetched exactly once; ``WAIT_TIME`` seconds are
    spent after every request.

    Args:
        session: ``requests.Session`` used for all requests.
        base_url: URL of the first listing page of the category.

    Raises:
        requests.RequestException: on a network failure, a timeout or an
            HTTP error status.
    """
    url = base_url
    while url is not None:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        # Without this an HTTP error page would be parsed as "no pages".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        yield from extract_links(soup)

        time.sleep(WAIT_TIME)
        next_link = get_next_link(soup)
        # The href is relative, so resolve it against the category URL.
        url = None if next_link is None else urljoin(base_url, next_link['href'])


def main():
    """Read category names from INPUT_FILE and append their pages to OUTPUT_FILE.

    Every title is also printed to standard output.  A category whose
    download fails is reported on standard error and skipped; titles already
    collected for it are kept.
    """
    import sys  # local import: only needed for error reporting

    # Both files hold Tamil text, so the encoding is fixed to UTF-8 instead
    # of depending on the platform's locale default.
    with open(INPUT_FILE, 'r', encoding='utf-8', newline='') as csv_input, \
            open(OUTPUT_FILE, 'a', encoding='utf-8') as output, \
            requests.Session() as session:
        for row in csv.reader(csv_input, delimiter="~"):
            # Skip blank lines / empty first fields rather than crashing.
            if not row or not row[0].strip():
                continue
            base_url = CATEGORY_URL_PREFIX + row[0].strip()

            try:
                for title in _iter_category_titles(session, base_url):
                    print(title)
                    output.write(title + '\n')
            except requests.RequestException as error:
                print("Skipping {}: {}".format(base_url, error), file=sys.stderr)


if __name__ == "__main__":
    main()