Getting a specific number from parsed text

73 views Asked by At

I am new to coding. This code works in two steps. In the first step, the user types the name of the article they want to find and clicks the "ResearchGate" button. That website gives us the DOI number. In the second step, we search for this DOI number by clicking the "SCI-HUB" button. That's it.

What I am trying to do is get this DOI number from the parsed text and then search for that DOI on Sci-Hub. I presume we can do this with only one text box and one button, so that when the user types the name of the article, the program opens the result it found on Sci-Hub.

I wrote a little bit of web parsing below.

from PyQt5.QtWidgets import *
from PyQt5.QtWidgets import QApplication, QMainWindow, QMessageBox
from bs4 import BeautifulSoup
import sys
import webbrowser


def doi(event):
    """Open the Sci-Hub page for the DOI typed into the text box.

    Used as the handler for the "SCI-HUB" button. When the text box is empty
    or contains only whitespace, a notification dialog asks for a DOI and no
    browser window is opened.

    Args:
        event: Value delivered by the signal connection; not used.
    """
    # Strip first: a DOI pasted with stray spaces or a trailing newline would
    # otherwise end up inside the URL, and whitespace-only input would slip
    # past the empty check.
    doi_text = window.textbox.text().strip()
    if not doi_text:
        QMessageBox.about(window, "Notification", "Please type the DOI of the article you want to find")
        return

    webbrowser.open_new("https://sci-hub.se/" + doi_text)

def research():
    """Open a ResearchGate publication search for the typed article name.

    Used as the handler for the "ResearchGate" button. When the text box is
    empty or contains only whitespace, a notification dialog asks for an
    article name and no browser window is opened.
    """
    # Local import so this handler does not depend on the file's import block.
    from urllib.parse import quote_plus

    query = window.textbox.text().strip()
    if not query:
        QMessageBox.about(window, "Notification", "Please type the name of the article")
        return

    # Titles routinely contain spaces, '&', '#', '?' or '+'. Left unencoded,
    # those characters end or corrupt the `q` parameter, so the search would
    # run on only part of the title.
    url = "https://www.researchgate.net/search/publication?q=" + quote_plus(query)
    webbrowser.open_new(url)

def quit_window() -> None:
    """Close the main window; connected to the "Exit" button."""
    window.close()


# --- Application and main window --------------------------------------------
app = QApplication(sys.argv)

window = QMainWindow()
window.setWindowTitle("Publication Search")
window.setGeometry(500, 300, 300, 300)  # x, y, width, height

# Single input field shared by both search buttons; the handlers read it
# through the `window.textbox` attribute.
window.textbox = QLineEdit(window)
window.textbox.setGeometry(20, 70, 270, 30)  # x, y, width, height
window.textbox.setPlaceholderText("Please type the name of the article you want to find")

# --- Buttons, stacked vertically below the text box --------------------------
button1 = QPushButton("ResearchGate", window)
button1.move(100, 130)
button1.clicked.connect(research)

button2 = QPushButton("SCI-HUB", window)
button2.move(100, 170)
button2.clicked.connect(doi)

button3 = QPushButton("Exit", window)
button3.move(100, 210)
button3.clicked.connect(quit_window)

# Show the window and hand control to the Qt event loop; its return code
# becomes the process exit status.
window.show()
sys.exit(app.exec_())

Here is what I did to get the parsed text from the website.

import requests 
import bs4
from bs4 import BeautifulSoup
  
# Search ResearchGate for one article title and print the text of the
# returned page.
query = "Mobile TV: a new form of entertainment?"
search_url = f'https://www.researchgate.net/search/publication?q={query}'

response = requests.get(search_url)
page = BeautifulSoup(response.text, "html.parser")

print(page.get_text())
# print(page.prettify())  # alternative: dump the parsed markup instead
1

There is 1 answer

0
furas On

I see the DOI inside a <span class=""> element, so you can try soup.find_all('span') and then check whether each span's text starts with DOI:

I use get(url, params={'q': text}) instead of appending ?q=text to the URL myself, so that requests percent-encodes spaces and other special characters in the text. Some servers may need it.

import requests 
from bs4 import BeautifulSoup
import webbrowser
  
# Search ResearchGate for one article title and open every DOI found on the
# first results page in Sci-Hub.
text = "Mobile TV: a new form of entertainment?"

# Passing the query through `params` lets requests percent-encode it.
payload = {
    'q': text,
}

count = 0
seen = set()  # DOIs already opened, so a DOI shown in several spans opens once

url = 'https://www.researchgate.net/search/publication'
# requests has no default timeout; without one an unresponsive server would
# block the script indefinitely.
result = requests.get(url, params=payload, timeout=10)
#print(result.request.url)

soup = BeautifulSoup(result.text, "html.parser")
items = soup.find_all('span')

for span in items:
    doi = span.get_text(strip=True)
    if not doi.startswith('DOI:'):
        continue

    # Skip `DOI:` and any whitespace after it, so the URL never contains a
    # stray space.
    number = doi[4:].strip()
    if not number or number in seen:
        continue
    seen.add(number)

    print(doi)
    count += 1

    url_sci_hub = "https://sci-hub.se/" + number
    webbrowser.open_new(url_sci_hub)

print('count:', count)

Code for many pages with results on researchgate.net

import requests 
from bs4 import BeautifulSoup
import webbrowser
  
# Search ResearchGate for one article title, walk result pages 1-4, and open
# every distinct DOI found in Sci-Hub.
text = "Mobile TV: a new form of entertainment?"

url = 'https://www.researchgate.net/search/publication'

payload = {
    'q': text,
    'page': 0,  # placeholder; overwritten for each page in the loop below
}
count = 0
seen = set()  # DOIs already opened, shared across pages to avoid duplicates

for page in range(1, 5):
    print('--- page:', page, '---')

    payload['page'] = page
    # requests has no default timeout; without one an unresponsive server
    # would block the script indefinitely.
    result = requests.get(url, params=payload, timeout=10)
    #print(result.request.url)

    soup = BeautifulSoup(result.text, "html.parser")
    items = soup.find_all('span')

    for span in items:
        # Use a separate name so the search string in `text` is not overwritten.
        label = span.get_text(strip=True)
        if not label.startswith('DOI:'):
            continue

        # Skip `DOI:` and any whitespace after it, so the URL never contains
        # a stray space.
        number = label[4:].strip()
        if not number or number in seen:
            continue
        seen.add(number)

        print(label)
        count += 1

        url_sci_hub = "https://sci-hub.se/" + number
        webbrowser.open_new(url_sci_hub)

print('count:', count)