# Source: விக்கிமூலம்:பைத்தான்3நிரல்கள்/பகுப்புப்பக்கங்களைஎடுத்தல்
# (Tamil Wikisource Python-3 scripts page: "fetching category pages".
#  The original page title and the "Appearance" wiki UI label were bare text
#  left over from the wiki extraction; commented out so the file parses.)
from pprint import pprint
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests, csv, time
# subprocess,pywikibot, re
# https://stackoverflow.com/questions/41391168/pythonic-beautifulsoup4-how-to-get-remaining-titles-from-the-next-page-link-of
# https://ta.wikisource.org/s/3v3 (wiki short url)
# Fetch all member-page titles of each Tamil Wikisource category listed in
# 1-catNames.csv (one category name per row, '~'-delimited) and append them,
# one title per line, to 1-catPages-bs4.txt.  Categories with many members
# are split across several listing pages; we follow the "next page" links.
#
# Titles are written to a plain .txt file (not csv) because the default csv
# delimiter (comma) can appear inside titles.

WAIT_TIME = 5  # seconds between successive HTTP requests (politeness delay)

# Category pages live under this prefix, e.g.
# https://ta.wikisource.org/wiki/பகுப்பு:<category name>
CATEGORY_URL_PREFIX = 'https://ta.wikisource.org/wiki/பகுப்பு:'


def get_next_link(soup):
    """Return the <a> tag linking to the next listing page, or None on the last page."""
    return soup.find("a", text="அடுத்த பக்கம்")


def extract_links(soup):
    """Return the member-page titles listed in the #mw-pages section of a category page."""
    return [a['title'] for a in soup.select("#mw-pages li a")]


# Open the input CSV, the output file, and one HTTP session for the whole run
# (previously the output file was re-opened and closed for every single title).
with open('1-catNames.csv', 'r') as csvInputFile, \
        open('1-catPages-bs4.txt', 'a') as OutputFile, \
        requests.Session() as session:
    for csvWord in csv.reader(csvInputFile, delimiter="~"):
        if not csvWord:
            continue  # skip blank rows in the input CSV
        wikiPageName = csvWord[0]
        base_url = CATEGORY_URL_PREFIX + wikiPageName

        # First listing page of this category.
        content = session.get(base_url).content
        soup = BeautifulSoup(content, 'lxml')
        links = extract_links(soup)

        # Follow "next page" links until exhausted, accumulating titles.
        # NOTE: the original script ran this pagination loop *after* the
        # write loop, so titles from every page past the first were fetched
        # but never written — collecting everything first fixes that.
        next_link = get_next_link(soup)
        while next_link is not None:
            time.sleep(WAIT_TIME)  # rate-limit between requests, not per title
            url = urljoin(base_url, next_link['href'])
            content = session.get(url).content
            soup = BeautifulSoup(content, 'lxml')
            links += extract_links(soup)
            next_link = get_next_link(soup)

        # Emit every collected title for this category.
        for item in links:
            print(item)
            OutputFile.write(item + '\n')