3 odpowiedzi

Najlepsza odpowiedź

Aby pobrać dane ze wszystkich stron, zwróć uwagę, że końcowy parametr w adresie URL zwiększa się o 2, a nie o 1. Dlatego poniższy kod znajduje największy numer strony na liście, mnoży go przez 2 i używa wyniku jako zakresu:

import requests, re, contextlib
from bs4 import BeautifulSoup as soup
import csv

@contextlib.contextmanager
def scrape_table(url):
    """Fetch one screener page and yield its table rows as dictionaries.

    Meant to be used as ``with scrape_table(url) as rows``. The yielded
    value is a list with one dict per table row; each dict pairs the text
    of a header cell with the text of the body cell in the same position.

    Args:
        url: Address of the page to download.

    Yields:
        A list of row dictionaries. The list is empty when the page
        contains no header cells.
    """
    d = soup(requests.get(url).text, 'html.parser')
    # Raw string: '\-' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    headers = [i.text for i in d.find_all('td', {'class': re.compile(r'table\-top')})]
    full_table = [i.text for i in d.find_all('td', {'class': re.compile('screener-body-table-nw')})]
    width = len(headers)
    if width:
        # The body cells come back as one flat list; cut it into chunks
        # that are each as long as the header row.
        grouped_table = [full_table[i:i + width] for i in range(0, len(full_table), width)]
    else:
        # With no headers the step would be 0 and range() would raise
        # ValueError, so report "no rows" instead.
        grouped_table = []
    yield [dict(zip(headers, i)) for i in grouped_table]

# Download every page of the screener table and append its rows to filename.csv.
start = 'https://www.finviz.com/screener.ashx?v=161'
# Parse the first page a single time; it provides both the page count and
# the header row, so there is no need to download it once for each.
first_page = soup(requests.get(start).text, 'html.parser')
page_links = first_page.find_all('a', {'class': 'screener-pages'})
# Largest page number shown in the pagination links. When there are no such
# links, fall back to 1 so only the first page is scraped instead of
# crashing on max() of an empty sequence.
max_link = max((int(a.text) for a in page_links), default=1)
headers = [i.text for i in first_page.find_all('td', {'class': re.compile(r'table\-top')})]
# First page, followed by r=21, r=41, ...: '{}1' appends a literal "1" to i*2.
urls = [start] + [
    'https://www.finviz.com/screener.ashx?v=161&r={}1'.format(i*2)
    for i in range(1, max_link)
]
# newline='' hands line-ending control to the csv module, as its
# documentation requires for files passed to csv.writer.
with open('filename.csv', 'a', newline='') as f:
    write = csv.writer(f)
    write.writerow(headers)
    for url in urls:
        with scrape_table(url) as result:
            # One list per row, in header order; filter(None, ...) drops
            # rows that came out empty.
            _r = list(filter(None, [[row[b] for b in headers] for row in result]))
            if _r:
                write.writerows(_r)

Wynik (pierwsza strona):

{'Profit M': '4.60%', 'Ticker': 'ABAC', 'Price': '2.17', 'ROI': '1.10%', 'Quick R': '17.40', 'Market Cap': '17.60M', 'Curr R': '19.00', 'Gross M': '16.30%', 'ROA': '1.40%', 'Dividend': '-', 'Earnings': 'Apr 02/a', 'LTDebt/Eq': '0.00', 'No.': '21', 'Volume': '1,744', 'ROE': '1.40%', 'Debt/Eq': '0.02', 'Change': '-1.36%', 'Oper M': '4.20%'}, {'Profit M': '11.10%', 'Ticker': 'ABAX', 'Price': '82.74', 'ROI': '8.90%', 'Quick R': '5.00', 'Market Cap': '1.88B', 'Curr R': '6.00', 'Gross M': '54.60%', 'ROA': '8.30%', 'Dividend': '0.87%', 'Earnings': 'Apr 26/a', 'LTDebt/Eq': '0.00', 'No.': '22', 'Volume': '253,661', 'ROE': '9.70%', 'Debt/Eq': '0.00', 'Change': '-0.07%', 'Oper M': '15.80%'}, {'Profit M': '5.90%', 'Ticker': 'ABB', 'Price': '23.23', 'ROI': '11.50%', 'Quick R': '0.90', 'Market Cap': '48.87B', 'Curr R': '1.20', 'Gross M': '30.40%', 'ROA': '4.90%', 'Dividend': '3.57%', 'Earnings': 'Apr 19/b', 'LTDebt/Eq': '0.39', 'No.': '23', 'Volume': '1,371,355', 'ROE': '14.80%', 'Debt/Eq': '0.58', 'Change': '2.15%', 'Oper M': '9.40%'}, {'Profit M': '21.50%', 'Ticker': 'ABBV', 'Price': '98.05', 'ROI': '25.40%', 'Quick R': '1.10', 'Market Cap': '157.01B', 'Curr R': '1.20', 'Gross M': '75.80%', 'ROA': '9.20%', 'Dividend': '3.92%', 'Earnings': 'Apr 26/b', 'LTDebt/Eq': '8.70', 'No.': '24', 'Volume': '14,832,723', 'ROE': '119.30%', 'Debt/Eq': '10.49', 'Change': '-0.90%', 'Oper M': '34.00%'}, {'Profit M': '0.50%', 'Ticker': 'ABC', 'Price': '83.34', 'ROI': '8.70%', 'Quick R': '0.50', 'Market Cap': '18.05B', 'Curr R': '0.90', 'Gross M': '2.90%', 'ROA': '2.40%', 'Dividend': '1.82%', 'Earnings': 'May 02/b', 'LTDebt/Eq': '1.46', 'No.': '25', 'Volume': '1,020,497', 'ROE': '32.00%', 'Debt/Eq': '1.53', 'Change': '1.46%', 'Oper M': '0.60%'}
0
Ajax1234 3 czerwiec 2018, 16:28

To jest kod, który – jak mi się wydaje – mi podałeś, ale prawdopodobnie popełniłem gdzieś błąd.

import requests
from bs4 import BeautifulSoup

def get_rows(r=0):
  """Download one screener page and return its table rows.

  r -- value for the "r" query parameter; it is appended to the URL only
       when it is greater than 0.

  Returns a (light_rows, dark_rows) tuple: the <tr> elements with class
  "table-light-row-cp" and "table-dark-row-cp" found inside the div whose
  id is "screener-content".
  """
  url = 'https://www.finviz.com/screener.ashx?v=161'
  if r > 0:
    url = url + "&r=" + str(r)
  response = requests.get(url)
  page = BeautifulSoup(response.content, "html.parser")
  container = page.find('div', attrs={'id': 'screener-content'})
  return (
    container.find_all('tr', class_="table-light-row-cp"),
    container.find_all('tr', class_="table-dark-row-cp"),
  )
# Collect the link text of every table cell from a few result pages,
# then write everything to AAA.csv.
data = []
# NOTE(review): range(0, 43, 21) requests r=0, 21 and 42 - confirm that these
# offsets line up with the site's paging.
for r in range(0, 43, 21):
  print("getting r={0}".format(r))
  light_rows, dark_rows=get_rows(r)
  # The row loop has to live inside the rows_set loop. Placed after it, the
  # loop only ever sees the last set (dark_rows) and all light rows are lost.
  for rows_set in (light_rows, dark_rows):
    for row in rows_set:
      row_data = [cell.a.get_text() for cell in row.find_all('td')]
      data.append(row_data)

#  sort rows to maintain original order
#  (light and dark rows are gathered separately; the first cell is read as an int)
data.sort(key=lambda x: int(x[0]))

import pandas
pandas.DataFrame(data).to_csv("AAA.csv", header=False)

Oto komunikaty o błędach, które otrzymuję.

>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> def get_rows(r=0):
...   base_url = 'https://www.finviz.com/screener.ashx?v=161'
...   if r > 0:
...     base_url+="&r=" + str(r)
...   html = requests.get(base_url)
...   soup = BeautifulSoup(html.content, "html.parser")
...   main_div = soup.find('div', attrs = {'id':'screener-content'})
...
>>>   light_rows = main_div.find_all('tr', class_="table-light-row-cp")
 File "<stdin>", line 1
  light_rows = main_div.find_all('tr', class_="table-light-row-cp")
  ^
IndentationError: unexpected indent
>>>   dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
 File "<stdin>", line 1
  dark_rows = main_div.find_all('tr', class_="table-dark-row-cp")
  ^
IndentationError: unexpected indent
>>>   return light_rows, dark_rows
 File "<stdin>", line 1
  return light_rows, dark_rows
  ^
IndentationError: unexpected indent
>>> data = []
>>> for r in range(0, 43, 21):
...   print("getting r={0}".format(r))
...   light_rows, dark_rows=get_rows(r)
...   for rows_set in (light_rows, dark_rows):
...     pass
...   for row in rows_set:
...     row_data = []
...     for cell in row.find_all('td'):
...       val = cell.a.get_text()
...       row_data.append(val)
...     data.append(row_data)
...
getting r=0
Traceback (most recent call last):
 File "<stdin>", line 3, in <module>
TypeError: 'NoneType' object is not iterable
>>> #  sort rows to maintain original order
... data.sort(key=lambda x: int(x[0]))
>>>
>>> import pandas
>>> pandas.DataFrame(data).to_csv("AAA.csv", header=False)
-2
J R 3 czerwiec 2018, 15:09

Zacznij od refaktoryzacji funkcji tak, aby pobierała dane dla dowolnego adresu URL z tej witryny:

def get_rows(r=0):
  """Fetch a screener page and hand back its light and dark table rows.

  r -- number placed in the "r" query parameter; values that are not
       greater than 0 leave the URL without that parameter.

  Returns two collections of <tr> elements taken from the div with id
  "screener-content": first those with class "table-light-row-cp", then
  those with class "table-dark-row-cp".
  """
  suffix = "&r=" + str(r) if r > 0 else ""
  address = 'https://www.finviz.com/screener.ashx?v=161' + suffix

  reply = requests.get(address)
  document = BeautifulSoup(reply.content, "html.parser")
  content_div = document.find('div', attrs={'id': 'screener-content'})

  light = content_div.find_all('tr', class_="table-light-row-cp")
  dark = content_div.find_all('tr', class_="table-dark-row-cp")
  return light, dark

Następnie przejdź w pętli po wybranych wartościach r:

# Gather the link text of every cell, one list per table row.
data = []
# NOTE(review): range(0, 43, 21) asks for r=0, 21 and 42 - confirm that these
# offsets match the site's paging.
for r in range(0, 43, 21):
  print("getting r={0}".format(r))
  # get_rows returns (light_rows, dark_rows); handle both sets the same way
  for rows_set in get_rows(r):
    data.extend(
      [cell.a.get_text() for cell in row.find_all('td')]
      for row in rows_set
    )


def _row_number(row_data):
  # the first cell of a row is read as an integer
  return int(row_data[0])


#  light and dark rows were collected separately; sorting on the first
#  cell puts them back in their original order
data.sort(key=_row_number)
0
OneCricketeer 3 czerwiec 2018, 14:49