2015年2月16日星期一

Extracting Online Data with Python

Notebook

Web Scraping

Goal

  • Extract information about the top 100 companies in the world

Data Extraction

In [2]:
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from pattern import web
%matplotlib inline 

# Forbes Global 2000 list page. Everything after '#' is a URL fragment, which
# an HTTP client does not send to the server, so the paging / sort / filter
# settings encoded there have no effect on the HTML fetched here.
url = 'http://www.forbes.com/global2000/list/#page:1_sort:0_direction:asc_search:_filter:All%20industries_filter:All%20countries_filter:All%20states'
# Bound the wait and fail loudly on an HTTP error instead of hanging forever
# or silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
# Parse the markup into a DOM that the later cells query by tag / selector.
dom = web.Element(html)
    

def _strip(s):
    '''
    taken string and remove all whitespace
    
    '''
    return re.sub(r' ','', s)

def get_href(href):
    '''
    Build an absolute forbes.com URL from a site-relative link.

    `href` is converted with str(), its space characters are removed,
    and the result is appended to the site root.
    '''
    site_root = 'http://www.forbes.com'
    relative = _strip(str(href))
    return site_root + relative


#print dom.by_tag('table')[0].content
In [4]:
# Number of trailing numeric columns read from each row's 'td.nowrap' cells.
N_VALUE_COLUMNS = 4

for d in dom.by_tag('table'):
    # Column name -> list of cell values, filled one table row at a time.
    columns = defaultdict(list)
    header = d.by_tag('tr')[0]
    headers = [str(h('a')[0].content) for h in header('th')]
    # Add a column name for the company's Forbes link right after the first header.
    headers.insert(1, 'Forbes Link')
    # After the insert, headers is expected to be:
    # ['Rank', 'Forbes Link', 'Company', 'Country', 'Sales', 'Profits', 'Assets', 'Market Value']
    for row in d.by_tag('tr')[1:]:
        company_cell = row('td.company')[0]
        columns[headers[0]].append(int(row('td.rank')[0].content))
        # Pull the href out of the raw cell markup: the text between the first
        # pair of double quotes that follows '<a href='.
        link = company_cell.content.split('<a href=')[1].split('>')[0].split('"')[1]
        columns[headers[1]].append(get_href(link))
        # NOTE: the name is taken verbatim from the element content, so HTML
        # entities (e.g. '&amp;') are not decoded.
        columns[headers[2]].append(company_cell.by_tag('h3')[0].content)
        # Country is addressed by position: the fifth cell from the end of the row.
        columns[headers[3]].append(row('td')[-5].content)
        value_cells = row('td.nowrap')
        for i in range(N_VALUE_COLUMNS):
            # Drop spaces, surrounding '$' and 'B' characters and thousands
            # commas so the remaining text parses as a float.
            text = _strip(value_cells[i].content).strip('$').strip('B').replace(',', '')
            columns[headers[i + 4]].append(float(text))

    # `result` is rebuilt for every table, so the last table on the page wins.
    result = pd.DataFrame(columns)

    # Parenthesised single-argument form prints the same under Python 2 and 3.
    print(result)
    Assets                     Company         Country  \
0   3124.9                        ICBC           China   
1   2449.5     China Construction Bank           China   
2   2405.4  Agricultural Bank of China           China   
3   2435.3              JPMorgan Chase   United States   
4    493.4          Berkshire Hathaway   United States   
5    346.8                 Exxon Mobil   United States   
6    656.6            General Electric   United States   
7   1543.0                 Wells Fargo   United States   
8   2291.8               Bank of China           China   
9    386.9                  PetroChina           China   
10   357.5           Royal Dutch Shell     Netherlands   
11   385.5                Toyota Motor           Japan   
12  2113.8             Bank of America   United States   
13  2671.3               HSBC Holdings  United Kingdom   
14   225.2                       Apple   United States   
15  1883.4                   Citigroup   United States   
16   305.7                          BP  United Kingdom   
17   253.8                     Chevron   United States   
18   446.9            Volkswagen Group         Germany   
19   204.8             Wal-Mart Stores   United States   
20   397.2                     Gazprom          Russia   
21   202.8         Samsung Electronics     South Korea   
22   277.8                       AT&T;   United States   
23  2480.5                 BNP Paribas          France   
24   239.1                       Total          France   
25   274.1      Verizon Communications   United States   
26   963.1                     Allianz         Germany   
27   192.8                China Mobile       Hong Kong   
28   228.4     Sinopec-China Petroleum           China   
29   319.2                   Petrobras          Brazil   
..     ...                         ...             ...   
70   226.2                        Enel           Italy   
71    88.7                        BASF         Germany   
72   156.6                    Softbank           Japan   
73   755.9     National Australia Bank       Australia   
74   659.7                         ANZ       Australia   
75   118.1              ConocoPhillips   United States   
76   815.2               TD Bank Group          Canada   
77    92.4                       Intel   United States   
78  1135.5                         UBS     Switzerland   
79   105.0             Hewlett-Packard   United States   
80    91.3                   Coca-Cola   United States   
81    98.4               Cisco Systems   United States   
82   109.4                      LukOil          Russia   
83    81.9          UnitedHealth Group   United States   
84    92.7                      Boeing   United States   
85   397.1      Zurich Insurance Group     Switzerland   
86   126.4               Hyundai Motor     South Korea   
87   132.4                      Sanofi          France   
88  2117.7             Credit Agricole          France   
89    90.6         United Technologies   United States   
90    69.9               Roche Holding     Switzerland   
91   343.1                   Munich Re         Germany   
92    77.5                     PepsiCo   United States   
93    86.6                      Oracle   United States   
94   702.1         Bank of Nova Scotia          Canada   
95    71.5                CVS Caremark   United States   
96  1488.7                   ING Group     Netherlands   
97    90.4      Saudi Basic Industries    Saudi Arabia   
98   105.6              Merck &amp; Co   United States   
99    83.2                 Walt Disney   United States   

                                          Forbes Link  Market Value  Profits  \
0               http://www.forbes.com/companies/icbc/         215.6     42.7   
1   http://www.forbes.com/companies/china-construc...         174.4     34.2   
2   http://www.forbes.com/companies/agricultural-b...         141.1     27.0   
3     http://www.forbes.com/companies/jpmorgan-chase/         229.7     17.3   
4   http://www.forbes.com/companies/berkshire-hath...         309.1     19.5   
5        http://www.forbes.com/companies/exxon-mobil/         422.3     32.6   
6   http://www.forbes.com/companies/general-electric/         259.6     14.8   
7        http://www.forbes.com/companies/wells-fargo/         261.4     21.9   
8      http://www.forbes.com/companies/bank-of-china/         124.2     25.5   
9         http://www.forbes.com/companies/petrochina/         202.0     21.1   
10  http://www.forbes.com/companies/royal-dutch-sh...         234.1     16.4   
11      http://www.forbes.com/companies/toyota-motor/         193.5     18.8   
12   http://www.forbes.com/companies/bank-of-america/         183.3     11.4   
13     http://www.forbes.com/companies/hsbc-holdings/         192.6     16.3   
14             http://www.forbes.com/companies/apple/         483.1     37.0   
15         http://www.forbes.com/companies/citigroup/         145.1     13.4   
16                http://www.forbes.com/companies/bp/         148.8     23.6   
17           http://www.forbes.com/companies/chevron/         227.2     21.4   
18  http://www.forbes.com/companies/volkswagen-group/         119.0     12.0   
19   http://www.forbes.com/companies/wal-mart-stores/         247.9     16.0   
20           http://www.forbes.com/companies/gazprom/          88.8     39.0   
21  http://www.forbes.com/companies/samsung-electr...         186.5     27.2   
22               http://www.forbes.com/companies/att/         182.7     18.2   
23       http://www.forbes.com/companies/bnp-paribas/          98.6      6.4   
24             http://www.forbes.com/companies/total/         149.8     11.2   
25  http://www.forbes.com/companies/verizon-commun...         197.7     11.5   
26           http://www.forbes.com/companies/allianz/          77.2      8.0   
27      http://www.forbes.com/companies/china-mobile/         184.6     19.8   
28  http://www.forbes.com/companies/sinopec-china-...          94.7     10.9   
29         http://www.forbes.com/companies/petrobras/          86.8     10.9   
..                                                ...           ...      ...   
70              http://www.forbes.com/companies/enel/          53.2      4.3   
71              http://www.forbes.com/companies/basf/         102.3      6.4   
72          http://www.forbes.com/companies/softbank/          91.2      5.8   
73  http://www.forbes.com/companies/national-austr...          75.3      5.4   
74               http://www.forbes.com/companies/anz/          83.9      6.2   
75    http://www.forbes.com/companies/conocophillips/          86.3      9.2   
76     http://www.forbes.com/companies/td-bank-group/          86.2      6.6   
77             http://www.forbes.com/companies/intel/         129.2      9.6   
78               http://www.forbes.com/companies/ubs/          81.0      3.4   
79   http://www.forbes.com/companies/hewlett-packard/          63.0      5.3   
80         http://www.forbes.com/companies/coca-cola/         168.7      8.5   
81     http://www.forbes.com/companies/cisco-systems/         119.0      8.2   
82            http://www.forbes.com/companies/lukoil/          47.7      7.8   
83  http://www.forbes.com/companies/unitedhealth-g...          81.0      5.6   
84            http://www.forbes.com/companies/boeing/          95.3      4.6   
85  http://www.forbes.com/companies/zurich-insuran...          45.8      4.0   
86     http://www.forbes.com/companies/hyundai-motor/          49.7      7.8   
87            http://www.forbes.com/companies/sanofi/         137.1      4.9   
88   http://www.forbes.com/companies/credit-agricole/          41.0      3.3   
89  http://www.forbes.com/companies/united-technol...         108.1      5.7   
90     http://www.forbes.com/companies/roche-holding/         253.7     12.0   
91         http://www.forbes.com/companies/munich-re/          38.9      4.4   
92           http://www.forbes.com/companies/pepsico/         126.2      6.7   
93            http://www.forbes.com/companies/oracle/         185.0     11.1   
94  http://www.forbes.com/companies/bank-of-nova-s...          71.2      6.3   
95      http://www.forbes.com/companies/cvs-caremark/          87.8      4.6   
96         http://www.forbes.com/companies/ing-group/          56.1      4.4   
97  http://www.forbes.com/companies/saudi-basic-in...          94.4      6.7   
98          http://www.forbes.com/companies/merck-co/         165.8      4.4   
99       http://www.forbes.com/companies/walt-disney/         142.9      6.6   

    Rank  Sales  
0      1  148.7  
1      2  121.3  
2      3  136.4  
3      4  105.7  
4      5  178.8  
5      6  394.0  
6      7  143.3  
7      8   88.7  
8      9  105.1  
9     10  328.5  
10    11  451.4  
11    12  255.6  
12    13  101.5  
13    14   79.6  
14    15  173.8  
15    16   94.1  
16    17  379.2  
17    18  211.8  
18    19  261.5  
19    20  476.5  
20    21  164.6  
21    22  208.9  
22    23  128.8  
23    24  123.2  
24    25  227.9  
25    26  120.6  
26    27  131.4  
27    28  102.5  
28    29  445.3  
29    30  141.2  
..   ...    ...  
70    71  106.3  
71    72   98.2  
72    73   55.6  
73    74   36.9  
74    75   34.0  
75    76   55.6  
76    76   31.3  
77    78   52.7  
78    79   39.7  
79    80  112.1  
80    81   46.3  
81    82   47.9  
82    83  119.2  
83    84  122.5  
84    84   86.6  
85    84   71.9  
86    87   79.8  
87    87   43.7  
88    89   65.3  
89    90   62.7  
90    90   50.5  
91    92   88.0  
92    93   66.4  
93    94   37.9  
94    95   27.7  
95    96  126.8  
96    97   34.5  
97    98   50.4  
98    99   44.1  
99   100   46.0  

[100 rows x 8 columns]

Analysis

  • Distribution of the top 100 companies by country (cross tab)
In [6]:
# Group the scraped companies by their 'Country' column.
result_byc = result.groupby('Country')
# Companies per country: number of non-null 'Assets' entries in each group.
count = result_byc.count()['Assets']
length = len(count)
# One integer x position per country, in the order of the grouped index.
ran = np.arange(length)

fig, ax = plt.subplots(figsize=(12, 4))
# Label every x position with its country; otherwise the axis only shows
# integer positions and the markers cannot be matched to a country.
ax.set_xticks(ran)
ax.set_xticklabels(count.index, rotation=90)
ax.set_xlabel('Country')
ax.set_ylabel('Number of companies')
ax.set_title('Top 100 companies by country')
ax.plot(ran, count, 'D')
Out[6]:
[<matplotlib.lines.Line2D at 0x1087b6fd0>]
In []:
 

没有评论:

发表评论