stock_tickers, stock_names, prices = get_price_rank()
print(stock_tickers)
print(stock_names)
print(prices)
['3008', '6415', '5269', '6409', '6669', '2207', '3406', '1590', '2454', '2327', '3563', '1476', '2059', '3533', '2330', '2049', '2395', '2912', '8454', '2227', '8464', '2379', '3443', '6414', '2474', '3661', '2231', '4137', '1256', '6230', '8341', '2357', '3665', '2492', '3034', '3653', '6452', '2404', '8462', '3413', '6670', '9921', '1707', '9910', '4551', '4439', '9914', '8070', '4968', '6666', '2345', '6491', '2707', '8016', '6271', '2439', '6451', '2360', '4943', '8422', '3130', '1477', '1537', '8482', '2308', '3376', '4766', '4935', '4438', '6213', '6504', '9802', '5871', '4536', '3023', '8480', '2383', '4763', '4958', '6533', '1558', '2456', '8081', '6706', '3044', '4977', '2723', '4572', '1723', '1232', '1536', '6531', '5288', '6269', '9941', '6464', '4190', '8210', '6176', '6278'] ['大立光', '矽力-KY', '祥碩', '旭隼', '緯穎', '和泰車', '玉晶光', '亞德客-KY', '聯發科', '國巨', '牧德', '儒鴻', '川湖', '嘉澤', '台積電', '上銀', '研華', '統一超', '富邦媒', '裕日車', '億豐', '瑞昱', '創意', '樺漢', '可成', '世芯-KY', '為升', '麗豐-KY', '鮮活果汁-KY', '超眾', '日友', '華碩', '貿聯-KY', '華新科', '聯詠', '健策', '康友-KY', '漢唐', '柏文', '京鼎', '復盛應用', '巨大', '葡萄王', '豐泰', '智伸科', '冠星-KY', '美利達', '長華', '立積', '羅麗芬-KY', '智邦', '晶碩', '晶華', '矽創', '同欣電', '美律', '訊芯-KY', '致茂', '康控-KY', '可寧衛', '一零四', '聚陽', '廣隆', '商億-KY', '台達電', '新日興', '南寶', '茂林-KY', '廣越', '聯茂', '南六', '鈺齊-KY', '中租-KY', '拓凱', '信邦', '泰昇-KY', '台光電', '材料-KY', '臻鼎-KY', '晶心科', '伸興', '奇力新', '致新', '惠特', '健鼎', '眾達-KY', '美食-KY', '駐龍', '中碳', '大統益', '和大', '愛普', '豐祥-KY', '台郡', '裕融', '台數科', '佐登-KY', '勤誠', '瑞儀', '台表科'] [4975.0, 1105.0, 818.0, 740.0, 721.0, 702.0, 590.0, 520.0, 419.0, 415.5, 414.5, 402.5, 360.0, 343.5, 333.0, 330.5, 314.5, 308.0, 303.0, 287.5, 285.0, 260.5, 260.0, 255.5, 254.5, 253.0, 252.5, 246.0, 240.0, 238.0, 238.0, 236.0, 232.0, 232.0, 230.5, 226.0, 223.5, 198.0, 195.0, 194.5, 194.5, 194.0, 193.5, 190.0, 189.0, 181.0, 176.0, 175.0, 174.0, 174.0, 174.0, 172.5, 167.5, 167.0, 166.5, 165.0, 159.0, 158.5, 158.5, 157.5, 156.5, 155.0, 153.0, 152.5, 152.5, 152.0, 151.5, 148.0, 147.5, 147.0, 146.5, 146.0, 142.5, 141.0, 138.0, 136.5, 136.0, 135.0, 134.5, 133.5, 133.0, 133.0, 129.0, 129.0, 128.0, 126.5, 126.5, 125.5, 124.0, 123.5, 122.5, 122.0, 121.0, 118.5, 118.0, 117.5, 117.0, 116.0, 115.0, 114.5]
from statistics import median
ky_prices = [price for stock_name, price in zip(stock_names, prices) if "KY" in stock_name]
print(median(ky_prices))
155.5
pandas
的開發有助於補足 Python 資料分析的需求,讓使用者能夠在 Python 中執行完整的資料分析流程,而無需切換到 data-centric 的特定語言,如 R。¶import pandas as pd
df = pd.DataFrame()
df["ticker"] = stock_tickers
df["stock_name"] = stock_names
df["price"] = prices
df.head()
ticker | stock_name | price | |
---|---|---|---|
0 | 3008 | 大立光 | 4975.0 |
1 | 6415 | 矽力-KY | 1105.0 |
2 | 5269 | 祥碩 | 818.0 |
3 | 6409 | 旭隼 | 740.0 |
4 | 6669 | 緯穎 | 721.0 |
df[df["stock_name"].str.contains("KY")]["price"].median()
155.5
Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more.
# run in bash shell
pip install pandas
# run in python console
import pandas as pd
使用 pd.Series()
函數創建 Series 類別,Series 從 ndarray 繼承了所有特性,並加上一組 Index。
import pandas as pd
movie_ratings = [8.0, 7.3, 8.5, 8.6]
ser = pd.Series(movie_ratings)
print(type(ser))
print(ser)
print(ser[3])
<class 'pandas.core.series.Series'> 0 8.0 1 7.3 2 8.5 3 8.6 dtype: float64 8.6
movie_ratings = [8.0, 7.3, 8.5, 8.6]
movie_titles = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
ser = pd.Series(movie_ratings, index=movie_titles)
print(ser)
print(ser["Avengers: Endgame"])
The Avengers 8.0 Avengers: Age of Ultron 7.3 Avengers: Infinity War 8.5 Avengers: Endgame 8.6 dtype: float64 8.6
movie_ratings = [8.0, 7.3, 8.5, 8.6]
movie_titles = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
ser = pd.Series(movie_ratings, index=movie_titles)
print(ser.index)
print(ser.values)
print(type(ser.index))
print(type(ser.values))
Index(['The Avengers', 'Avengers: Age of Ultron', 'Avengers: Infinity War', 'Avengers: Endgame'], dtype='object') [8. 7.3 8.5 8.6] <class 'pandas.core.indexes.base.Index'> <class 'numpy.ndarray'>
我們可以使用 pd.DataFrame()
函數創建 DataFrame 類別。
movie_ratings = [8.0, 7.3, 8.5, 8.6]
movie_titles = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
print(type(df))
df
<class 'pandas.core.frame.DataFrame'>
title | rating | |
---|---|---|
0 | The Avengers | 8.0 |
1 | Avengers: Age of Ultron | 7.3 |
2 | Avengers: Infinity War | 8.5 |
3 | Avengers: Endgame | 8.6 |
movie_ratings = [8.0, 7.3, 8.5, 8.6]
movie_titles = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
print(df.index)
print(df.columns)
print(df["title"])
print(df["rating"])
RangeIndex(start=0, stop=4, step=1) Index(['title', 'rating'], dtype='object') 0 The Avengers 1 Avengers: Age of Ultron 2 Avengers: Infinity War 3 Avengers: Endgame Name: title, dtype: object 0 8.0 1 7.3 2 8.5 3 8.6 Name: rating, dtype: float64
不論是 Series 或 DataFrame 物件都包含一個 Index 類別,作為萃取以及更新資料的根據,Index 可以被視為是一種結合了 tuple 的不可變(Immutable)特性以及 set 集合運算特性的資料結構類別,我們可以使用 pd.Index()
函數創建出下列的範例
pd_index = pd.Index([0, 2, 3, 4])
print(type(pd_index))
print(pd_index)
pd_index[0] = 1
<class 'pandas.core.indexes.numeric.Int64Index'> Int64Index([0, 2, 3, 4], dtype='int64')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-11-b5275c789408> in <module> 2 print(type(pd_index)) 3 print(pd_index) ----> 4 pd_index[0] = 1 ~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value) 4258 4259 def __setitem__(self, key, value): -> 4260 raise TypeError("Index does not support mutable operations") 4261 4262 def __getitem__(self, key): TypeError: Index does not support mutable operations
odds_index = pd.Index([1, 3, 5, 7, 9])
primes_index = pd.Index([2, 3, 5, 7])
print(odds_index & primes_index) # and
print(odds_index | primes_index) # or
print(odds_index ^ primes_index) # exclusive or
Int64Index([3, 5, 7], dtype='int64') Int64Index([1, 2, 3, 5, 7, 9], dtype='int64') Int64Index([1, 2, 9], dtype='int64')
pd.Series()
函數傳入一個「類似清單(list-like)」的物件,包含 list 、tuple 或 ndarray,如果沒有指定另外一個「類似清單」的物件作為索引,Series 會自動以類似 range()
函數設定對應長度的索引¶movie_ratings = [9.0, 8.9, 8.8, 8.7]
ser = pd.Series(movie_ratings)
print(ser)
0 9.0 1 8.9 2 8.8 3 8.7 dtype: float64
pd.Series()
函數中加入 index 參數指定對應長度的索引¶movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
ser = pd.Series(movie_ratings, index=movie_titles)
print(ser.index)
print(ser.values)
print(ser)
Index(['The Dark Knight', 'Schindler's List', 'Forrest Gump', 'Inception'], dtype='object') [9. 8.9 8.8 8.7] The Dark Knight 9.0 Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64
pd.Series()
函數中傳入 dict,如此一來字典中的鍵(Keys)會被記錄成為索引、字典中的值(Values)則會被記錄成為陣列中的資料¶movie_dict = {
"The Dark Knight": 9.0,
"Schindler's List": 8.9,
"Forrest Gump": 8.8,
"Inception": 8.7
}
ser = pd.Series(movie_dict)
print(movie_dict.keys())
print(movie_dict.values())
print("\n")
print(ser.index)
print(ser.values)
print(ser)
dict_keys(['The Dark Knight', "Schindler's List", 'Forrest Gump', 'Inception']) dict_values([9.0, 8.9, 8.8, 8.7]) Index(['The Dark Knight', 'Schindler's List', 'Forrest Gump', 'Inception'], dtype='object') [9. 8.9 8.8 8.7] The Dark Knight 9.0 Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64
pd.DataFrame()
函數傳入多個「類似清單(list-like)」的物件,包含 list 、tuple 或 ndarray,並視需求指定變數名稱或索引;同樣地,若是在沒有指定索引的情況下,DataFrame 會自動以類似 range()
函數設定對應長度的索引¶movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df
title | rating | |
---|---|---|
0 | The Dark Knight | 9.0 |
1 | Schindler's List | 8.9 |
2 | Forrest Gump | 8.8 |
3 | Inception | 8.7 |
pd.DataFrame()
函數中傳入 dict,如此一來字典中的鍵(Keys)會被記錄成為變數名稱、字典中的值(Values)則會被記錄成為陣列中的資料¶movie_dict = {
"title": ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"],
"rating": [9.0, 8.9, 8.8, 8.7]
}
df = pd.DataFrame(movie_dict)
df
title | rating | |
---|---|---|
0 | The Dark Knight | 9.0 |
1 | Schindler's List | 8.9 |
2 | Forrest Gump | 8.8 |
3 | Inception | 8.7 |
pd.read_csv()
函數讀入以逗號區隔變數的 CSV 文字檔¶df = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/movies.csv")
df
title | rating | |
---|---|---|
0 | The Dark Knight | 9.0 |
1 | Schindler's List | 8.9 |
2 | Forrest Gump | 8.8 |
3 | Inception | 8.7 |
pd.read_json()
函數讀入以 JSON 所組成的陣列文字檔¶df = pd.read_json("https://python4ds.s3-ap-northeast-1.amazonaws.com/movies.json")
df
title | rating | |
---|---|---|
0 | The Dark Knight | 9.0 |
1 | Schindler's List | 8.9 |
2 | Forrest Gump | 8.8 |
3 | Inception | 8.7 |
pd.read_excel()
函數讀入試算表¶df = pd.read_excel("https://python4ds.s3-ap-northeast-1.amazonaws.com/movies.xlsx")
df
title | rating | |
---|---|---|
0 | The Dark Knight | 9.0 |
1 | Schindler's List | 8.9 |
2 | Forrest Gump | 8.8 |
3 | Inception | 8.7 |
pd.read_sql()
函數讀入資料庫中的表格¶import sqlite3
# Creating a demo.db database in working directory
conn = sqlite3.connect('demo.db')
# Importing a table
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df.to_sql("movies", index=False, con=conn, if_exists='replace')
# Importing data from demo.movies
query_str = """
SELECT *
FROM movies
WHERE rating < 9.0;
"""
pd.read_sql(query_str, con=conn)
title | rating | |
---|---|---|
0 | Schindler's List | 8.9 |
1 | Forrest Gump | 8.8 |
2 | Inception | 8.7 |
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
ser = pd.Series(movie_ratings, index=movie_titles)
print(ser)
print(ser[0])
print(ser["Forrest Gump"])
The Dark Knight 9.0 Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64 9.0 8.8
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
ser = pd.Series(movie_ratings, index=movie_titles)
print(ser[1:4])
print(ser["Schindler's List":"Inception"])
Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64 Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
ser = pd.Series(movie_ratings, index=movie_titles)
# Fancy indexing
print(ser[[1, 2, 3]])
print(ser[["Schindler's List", "Forrest Gump", "Inception"]])
# Boolean indexing
print(ser < 9)
print(ser[ser < 9])
Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64 Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64 The Dark Knight False Schindler's List True Forrest Gump True Inception True dtype: bool Schindler's List 8.9 Forrest Gump 8.8 Inception 8.7 dtype: float64
利用 [COLUMN]
或 .COLUMN
能夠從資料框中選擇出單一或多個變數,成為一個 Series 或者變數欄位較少的資料框子集,實踐 SQL 語法中的 SELECT
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df["release_year"] = release_years
print(df["title"])
print(df.rating)
df[["title", "release_year"]]
0 The Dark Knight 1 Schindler's List 2 Forrest Gump 3 Inception Name: title, dtype: object 0 9.0 1 8.9 2 8.8 3 8.7 Name: rating, dtype: float64
title | release_year | |
---|---|---|
0 | The Dark Knight | 2008 |
1 | Schindler's List | 1993 |
2 | Forrest Gump | 1994 |
3 | Inception | 2010 |
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df["release_year"] = release_years
print(df["release_year"] > 2000)
df[df["release_year"] > 2000]
0 True 1 False 2 False 3 True Name: release_year, dtype: bool
title | rating | release_year | |
---|---|---|---|
0 | The Dark Knight | 9.0 | 2008 |
3 | Inception | 8.7 | 2010 |
.loc[ROW_LABEL, COLUMN_LABEL]
:純粹以列、欄標籤為準進行選擇跟篩選.iloc[ROW_INDEX, COLUMN_INDEX]
:純粹以資料的整數位置(integer location)為準進行選擇跟篩選movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["rating"] = movie_ratings
df["release_year"] = release_years
df.index = movie_titles
df
rating | release_year | |
---|---|---|
The Dark Knight | 9.0 | 2008 |
Schindler's List | 8.9 | 1993 |
Forrest Gump | 8.8 | 1994 |
Inception | 8.7 | 2010 |
使用 .loc[]
時必須傳入列標籤(電影名稱)與欄標籤(評等和上映年份)
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["rating"] = movie_ratings
df["release_year"] = release_years
df.index = movie_titles
df.loc[["The Dark Knight", "Inception"], ["rating", "release_year"]]
rating | release_year | |
---|---|---|
The Dark Knight | 9.0 | 2008 |
Inception | 8.7 | 2010 |
使用 .iloc[]
時必須傳入 “The Dark Knight” 與 “Inception” 的列整數位置(第 0 與 3 列)與欄整數位置(第 0 與 1 欄)
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["rating"] = movie_ratings
df["release_year"] = release_years
df.index = movie_titles
df.iloc[[0, 3], [0, 1]]
rating | release_year | |
---|---|---|
The Dark Knight | 9.0 | 2008 |
Inception | 8.7 | 2010 |
df.sort_index()
:依照資料框的列標籤遞增(預設)或遞減排序df.sort_values()
:依照指定的資料框欄標籤遞增(預設)或遞減排序movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df["release_year"] = release_years
df.sort_index() # default: ascending
title | rating | release_year | |
---|---|---|---|
0 | The Dark Knight | 9.0 | 2008 |
1 | Schindler's List | 8.9 | 1993 |
2 | Forrest Gump | 8.8 | 1994 |
3 | Inception | 8.7 | 2010 |
df.sort_index(ascending=False)
title | rating | release_year | |
---|---|---|---|
3 | Inception | 8.7 | 2010 |
2 | Forrest Gump | 8.8 | 1994 |
1 | Schindler's List | 8.9 | 1993 |
0 | The Dark Knight | 9.0 | 2008 |
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df["release_year"] = release_years
df.sort_values("title")
title | rating | release_year | |
---|---|---|---|
2 | Forrest Gump | 8.8 | 1994 |
3 | Inception | 8.7 | 2010 |
1 | Schindler's List | 8.9 | 1993 |
0 | The Dark Knight | 9.0 | 2008 |
df.sort_values("release_year")
title | rating | release_year | |
---|---|---|---|
1 | Schindler's List | 8.9 | 1993 |
2 | Forrest Gump | 8.8 | 1994 |
0 | The Dark Knight | 9.0 | 2008 |
3 | Inception | 8.7 | 2010 |
df.sort_values("release_year", ascending=False)
title | rating | release_year | |
---|---|---|---|
3 | Inception | 8.7 | 2010 |
0 | The Dark Knight | 9.0 | 2008 |
2 | Forrest Gump | 8.8 | 1994 |
1 | Schindler's List | 8.9 | 1993 |
透過 Series 從 ndarray 繼承而來的元素級別運算(element-wise operation)特性即可實踐,像是運用身高、體重這兩個欄位衍生出球員的 BMI
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
player_profile["bmi"] = player_profile["weightKilograms"] / player_profile["heightMeters"]**2
player_profile[["temporaryDisplayName", "bmi"]].head()
temporaryDisplayName | bmi | |
---|---|---|
0 | Adams, Steven | 26.493861 |
1 | Adebayo, Bam | 26.742788 |
2 | Adel, Deng | 22.449939 |
3 | Aldridge, LaMarcus | 26.481885 |
4 | Alexander, Kyle | 22.416388 |
透過 Series 的 .map()
方法來實踐,傳入 dict 作為對應的準則,字典的鍵(Key)為對應前的原始類別,字典的值(Value)為對應後的類別,例如將本來分類較細膩的鋒衛對應為較粗略的前場、後場
pos_dict = {
"G": "Backcourt",
"F": "Frontcourt",
"C": "Frontcourt",
"G-F": "Backcourt",
"F-C": "Frontcourt",
"F-G": "Frontcourt",
"C-F": "Frontcourt"
}
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
print("Pos before mapping:")
player_profile["pos"].value_counts()
Pos before mapping:
G 200 F 186 C 51 G-F 39 F-C 20 F-G 15 C-F 13 Name: pos, dtype: int64
print("Pos after mapping:")
player_profile["pos_recoded"] = player_profile["pos"].map(pos_dict)
player_profile["pos_recoded"].value_counts()
Pos after mapping:
Frontcourt 285 Backcourt 239 Name: pos_recoded, dtype: int64
透過 pd.cut()
函數將數值變數依照指定的門檻值或箱數切分成為類別變數,舉例來說將身高對應為小於等於 2 公尺以及超過 2 公尺兩個類別
import numpy as np
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
player_profile["heightCategory"] = pd.cut(player_profile["heightMeters"], [0, 2, np.Inf], labels=["<= 2m", "> 2m"])
player_profile[["temporaryDisplayName", "heightMeters", "heightCategory"]].head(10)
temporaryDisplayName | heightMeters | heightCategory | |
---|---|---|---|
0 | Adams, Steven | 2.13 | > 2m |
1 | Adebayo, Bam | 2.08 | > 2m |
2 | Adel, Deng | 2.01 | > 2m |
3 | Aldridge, LaMarcus | 2.11 | > 2m |
4 | Alexander, Kyle | 2.11 | > 2m |
5 | Alexander-Walker, Nickeil | 1.96 | <= 2m |
6 | Allen, Grayson | 1.96 | <= 2m |
7 | Allen, Jarrett | 2.11 | > 2m |
8 | Allen, Kadeem | 1.90 | <= 2m |
9 | Aminu, Al-Farouq | 2.06 | > 2m |
透過 .apply()
方法來實踐,傳入函數或 lambda 表示式作為映射的準則,例如將本來分類較細膩的鋒衛對應為較粗略的 G、F 與 C
def recode_pos(x):
if x[0] == 'G':
return 'G'
elif x[0] == 'F':
return 'F'
elif x[0] == 'C':
return 'C'
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
player_profile["pos_recoded"] = player_profile["pos"].apply(recode_pos)
print("Pos before applying:")
player_profile["pos"].value_counts()
Pos before applying:
G 200 F 186 C 51 G-F 39 F-C 20 F-G 15 C-F 13 Name: pos, dtype: int64
print("Pos after applying:")
player_profile["pos_recoded"].value_counts()
Pos after applying:
G 239 F 221 C 64 Name: pos_recoded, dtype: int64
對資料框呼叫常用的摘要方法
.count()
計算列數.mean()
與 .median()
計算平均和中位數.min()
與 .max()
計算最小和最大值.std()
與 .var()
計算標準差和變異數.prod()
計算乘積.sum()
計算總和movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
movie_length_mins = [152, 195, 142, 148]
df = pd.DataFrame()
df["rating"] = movie_ratings
df["release_year"] = release_years
df["movie_length_mins"] = movie_length_mins
df.index = movie_titles
print(df.max())
print("\n")
print(df.max(axis=1))
rating 9.0 release_year 2010.0 movie_length_mins 195.0 dtype: float64 The Dark Knight 2008.0 Schindler's List 1993.0 Forrest Gump 1994.0 Inception 2010.0 dtype: float64
df["col_name"].max()
df["col_name"].min()
df["col_name"].idxmax()
df["col_name"].idxmin()
import pandas as pd
numbers = [9, 23, 33, 91, 13]
players = ["Ron Harper", "Michael Jordan", "Scottie Pippen", "Dennis Rodman", "Luc Longley"]
df = pd.DataFrame()
df["number"] = numbers
df["player"] = players
df
number | player | |
---|---|---|
0 | 9 | Ron Harper |
1 | 23 | Michael Jordan |
2 | 33 | Scottie Pippen |
3 | 91 | Dennis Rodman |
4 | 13 | Luc Longley |
max_number = df["number"].max()
df[df["number"] == max_number]["player"][3]
'Dennis Rodman'
df.set_index("player")["number"].idxmax()
'Dennis Rodman'
min_number = df["number"].min()
df[df["number"] == min_number]["player"][0]
'Ron Harper'
df.set_index("player")["number"].idxmin()
'Ron Harper'
df["col_name"].nunique()
df["col_name"].unique()
df["col_name"].value_counts()
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
print(player_profile["country"].size)
print(player_profile["country"].nunique())
print(player_profile["country"].unique())
print(player_profile["country"].value_counts())
524 42 ['New Zealand' 'USA' 'South Sudan' 'Canada' 'Greece' 'United Kingdom' 'Bahamas' 'France' 'Australia' 'Italy' 'Croatia' 'Latvia' 'Georgia' 'Democratic Republic of the Congo' 'Serbia' 'Sudan' 'Germany' 'Lithuania' 'Brazil' 'Slovenia' 'Switzerland' nan 'Mali' 'Senegal' 'Guinea' 'Cameroon' 'Angola' ' ' 'Spain' 'Yugoslavia' 'Japan' 'Dominican Republic' 'Turkey' 'Haiti' 'Ukraine' 'Finland' 'Bosnia and Herzegovina' 'Egypt' 'Nigeria' 'Austria' 'Czech Republic' 'Gabon' 'Montenegro'] USA 393 Canada 20 France 12 Australia 10 Croatia 7 Germany 6 Serbia 6 Spain 5 Lithuania 4 Turkey 4 Brazil 3 Slovenia 3 Latvia 3 Democratic Republic of the Congo 3 Italy 3 Greece 3 Sudan 2 United Kingdom 2 Senegal 2 South Sudan 2 Mali 2 Japan 2 Bosnia and Herzegovina 2 Bahamas 2 Cameroon 2 Ukraine 2 Haiti 1 Yugoslavia 1 Guinea 1 New Zealand 1 Switzerland 1 Dominican Republic 1 Montenegro 1 Nigeria 1 Georgia 1 Finland 1 Angola 1 Austria 1 Gabon 1 Egypt 1 1 Czech Republic 1 Name: country, dtype: int64
更多的應用情境中我們會指定一些類別變數分組,在分組的前提下對資料框變數呼叫簡單的摘要方法,這樣的操作源自於 SQL 資料庫查詢語言的 GROUP BY 語法
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
groupby_object = player_profile.groupby("pos")
print(groupby_object["heightMeters"].mean()) # Average height by pos
print("\n")
print(groupby_object["weightKilograms"].mean()) # Average weight by pos
pos C 2.126863 C-F 2.090000 F 2.044378 F-C 2.107500 F-G 1.995333 G 1.924800 G-F 2.000000 Name: heightMeters, dtype: float64 pos C 114.335294 C-F 110.676923 F 101.589730 F-C 110.855000 F-G 95.126667 G 88.896000 G-F 95.717949 Name: weightKilograms, dtype: float64
df.set_index("col_name")
df.reset_index()
movie_ratings = [9.0, 8.9, 8.8, 8.7]
movie_titles = ["The Dark Knight", "Schindler's List", "Forrest Gump", "Inception"]
release_years = [2008, 1993, 1994, 2010]
df = pd.DataFrame()
df["title"] = movie_titles
df["rating"] = movie_ratings
df["release_year"] = release_years
df
title | rating | release_year | |
---|---|---|---|
0 | The Dark Knight | 9.0 | 2008 |
1 | Schindler's List | 8.9 | 1993 |
2 | Forrest Gump | 8.8 | 1994 |
3 | Inception | 8.7 | 2010 |
df = df.set_index("title")
df
rating | release_year | |
---|---|---|
title | ||
The Dark Knight | 9.0 | 2008 |
Schindler's List | 8.9 | 1993 |
Forrest Gump | 8.8 | 1994 |
Inception | 8.7 | 2010 |
df = df.reset_index()
df
title | rating | release_year | |
---|---|---|---|
0 | The Dark Knight | 9.0 | 2008 |
1 | Schindler's List | 8.9 | 1993 |
2 | Forrest Gump | 8.8 | 1994 |
3 | Inception | 8.7 | 2010 |
import pandas as pd
df = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/olympics.csv", index_col=0)
df.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | ID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan | 13 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 | AFG |
Algeria | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 | ALG |
Argentina | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 | ARG |
Armenia | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 | ARM |
Australasia | 2 | 3 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 | ANZ |
def answer_one(df):
"""
這個函數應該回傳一個文字,國家名
"""
return "答案"
answer_one(df)
'United States'
def answer_two(df):
"""
這個函數應該回傳一個文字,國家名
"""
return "答案"
answer_two(df)
'United States'
def answer_three(df):
"""
這個函數應該回傳一個文字,國家名
"""
return "答案"
answer_three(df)
'Bulgaria'
def answer_four(df):
"""
這個函數應該回傳一個 Series,長度為 146
"""
return "答案"
answer_four(df)[:10]
Afghanistan 2 Algeria 27 Argentina 130 Armenia 16 Australasia 22 Australia 923 Austria 569 Azerbaijan 43 Bahamas 24 Bahrain 1 dtype: int64
.isnull()
.notnull()
.dropna()
.fillna()
ser = pd.Series([5, None, 6, np.NaN])
print(ser.isnull())
print("\n")
print(ser.notnull())
0 False 1 True 2 False 3 True dtype: bool 0 True 1 False 2 True 3 False dtype: bool
ser = pd.Series([5, None, 6, np.NaN])
print(ser)
print("\n")
ser.dropna()
0 5.0 1 NaN 2 6.0 3 NaN dtype: float64
0 5.0 2 6.0 dtype: float64
df = pd.DataFrame([
[1, np.nan, 7.],
[2, 5, 8.],
[np.nan, 6, 9.]
])
df
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | NaN | 7.0 |
1 | 2.0 | 5.0 | 8.0 |
2 | NaN | 6.0 | 9.0 |
df.dropna() # default dropping rows with any NaN
0 | 1 | 2 | |
---|---|---|---|
1 | 2.0 | 5.0 | 8.0 |
df.dropna(axis=1) # dropping columns with any NaN
2 | |
---|---|
0 | 7.0 |
1 | 8.0 |
2 | 9.0 |
ser = pd.Series([5, None, 6, np.NaN])
print(ser)
print("\n")
ser.fillna(5566)
0 5.0 1 NaN 2 6.0 3 NaN dtype: float64
0 5.0 1 5566.0 2 6.0 3 5566.0 dtype: float64
ser = pd.Series([5, None, 6, np.NaN, 7])
print(ser)
print("\n")
print(ser.fillna(method='ffill'))
print("\n")
print(ser.fillna(method='bfill'))
0 5.0 1 NaN 2 6.0 3 NaN 4 7.0 dtype: float64 0 5.0 1 5.0 2 6.0 3 6.0 4 7.0 dtype: float64 0 5.0 1 6.0 2 6.0 3 7.0 4 7.0 dtype: float64
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
groupby_object = player_profile.groupby(["pos", "country"])
print(groupby_object["heightMeters"].mean()) # Average height by pos and country
print(type(groupby_object["heightMeters"].mean().index))
pos country C Austria 2.130000 Bahamas 2.160000 Bosnia and Herzegovina 2.130000 Canada 2.060000 Croatia 2.135000 ... G-F France 2.006667 Italy 1.960000 Japan 2.060000 Turkey 2.010000 USA 1.993214 Name: heightMeters, Length: 88, dtype: float64 <class 'pandas.core.indexes.multi.MultiIndex'>
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
groupby_object = player_profile.groupby(["pos", "country"])
ser_w_multi_index = groupby_object["heightMeters"].mean() # Average height by pos and country
print(ser_w_multi_index.values) # values attribute of a multi-index series
print(ser_w_multi_index["G", "USA"]) # average height of USA's guards
print(ser_w_multi_index[["G-F", "F-G"]][:, "USA"]) # average heights of USA's swingmen
[2.13 2.16 2.13 2.06 2.135 2.06 2.13333333 2.11 2.11 2.13 2.13 2.13 2.2 2.17 2.135 2.18 2.08 2.11 2.11961538 2.16 2.08 2.08 2.08 2.08 2.13 2.09 2.06 2.08 2.055 2.06 2.06 2.07 2.082 1.98 2.13 2.00666667 2.06 2.07 2.06666667 2.06 2.11 2.07 2.03 2.07 2.06 2.045 2.07 2.06 2.085 2.06 2.06 2.055 2.03795455 1.995 2.08 2.13 2.11 2.13 2.21 2.09933333 2.02 2.03 2.01 1.98636364 1.92 1.93 1.85 1.94142857 1.96 1.97 1.94 2.03 1.93 1.98 1.9 1.93 1.9216763 2.03 1.98 2.045 2.06 1.98 2.01 2.00666667 1.96 2.06 2.01 1.99321429] 1.9216763005780348 pos F-G 1.986364 G-F 1.993214 Name: heightMeters, dtype: float64
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
wide_format = player_profile[["temporaryDisplayName", "heightMeters", "weightKilograms"]]
wide_format.head()
temporaryDisplayName | heightMeters | weightKilograms | |
---|---|---|---|
0 | Adams, Steven | 2.13 | 120.2 |
1 | Adebayo, Bam | 2.08 | 115.7 |
2 | Adel, Deng | 2.01 | 90.7 |
3 | Aldridge, LaMarcus | 2.11 | 117.9 |
4 | Alexander, Kyle | 2.11 | 99.8 |
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
wide_format = player_profile[["temporaryDisplayName", "heightMeters", "weightKilograms"]]
long_format = pd.melt(wide_format, id_vars="temporaryDisplayName", value_vars=["heightMeters", "weightKilograms"], var_name="key", value_name="value")
long_format.sort_values("temporaryDisplayName").head(10)
temporaryDisplayName | key | value | |
---|---|---|---|
0 | Adams, Steven | heightMeters | 2.13 |
524 | Adams, Steven | weightKilograms | 120.20 |
525 | Adebayo, Bam | weightKilograms | 115.70 |
1 | Adebayo, Bam | heightMeters | 2.08 |
2 | Adel, Deng | heightMeters | 2.01 |
526 | Adel, Deng | weightKilograms | 90.70 |
3 | Aldridge, LaMarcus | heightMeters | 2.11 |
527 | Aldridge, LaMarcus | weightKilograms | 117.90 |
4 | Alexander, Kyle | heightMeters | 2.11 |
528 | Alexander, Kyle | weightKilograms | 99.80 |
long_format.sort_values("temporaryDisplayName").head(10)
temporaryDisplayName | key | value | |
---|---|---|---|
0 | Adams, Steven | heightMeters | 2.13 |
524 | Adams, Steven | weightKilograms | 120.20 |
525 | Adebayo, Bam | weightKilograms | 115.70 |
1 | Adebayo, Bam | heightMeters | 2.08 |
2 | Adel, Deng | heightMeters | 2.01 |
526 | Adel, Deng | weightKilograms | 90.70 |
3 | Aldridge, LaMarcus | heightMeters | 2.11 |
527 | Aldridge, LaMarcus | weightKilograms | 117.90 |
4 | Alexander, Kyle | heightMeters | 2.11 |
528 | Alexander, Kyle | weightKilograms | 99.80 |
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
wide_format = player_profile[["temporaryDisplayName", "heightMeters", "weightKilograms"]]
long_format = pd.melt(wide_format, id_vars="temporaryDisplayName", value_vars=["heightMeters", "weightKilograms"], var_name="key", value_name="value")
long_format.pivot(index="temporaryDisplayName", columns="key", values="value").head()
key | heightMeters | weightKilograms |
---|---|---|
temporaryDisplayName | ||
Adams, Steven | 2.13 | 120.2 |
Adebayo, Bam | 2.08 | 115.7 |
Adel, Deng | 2.01 | 90.7 |
Aldridge, LaMarcus | 2.11 | 117.9 |
Alexander, Kyle | 2.11 | 99.8 |
player_profile = pd.read_csv("https://python4ds.s3-ap-northeast-1.amazonaws.com/player_profile.csv")
wide_format = player_profile[["temporaryDisplayName", "heightMeters", "weightKilograms"]]
long_format = pd.melt(wide_format, id_vars="temporaryDisplayName", value_vars=["heightMeters", "weightKilograms"], var_name="key", value_name="value")
wide_format = long_format.pivot(index="temporaryDisplayName", columns="key", values="value").reset_index()
wide_format = wide_format.rename_axis(None, axis=1)
wide_format.head()
temporaryDisplayName | heightMeters | weightKilograms | |
---|---|---|---|
0 | Adams, Steven | 2.13 | 120.2 |
1 | Adebayo, Bam | 2.08 | 115.7 |
2 | Adel, Deng | 2.01 | 90.7 |
3 | Aldridge, LaMarcus | 2.11 | 117.9 |
4 | Alexander, Kyle | 2.11 | 99.8 |
pd.concat()
df.append()
pd.merge()
df.join()
upper_df = pd.DataFrame()
lower_df = pd.DataFrame()
upper_df["character"] = ["Rachel Green", "Monica Geller", "Phoebe Buffay"]
upper_df["cast"] = ["Jennifer Aniston", "Courteney Cox", "Lisa Kudrow"]
lower_df["character"] = ["Joey Tribbiani", "Chandler Bing", "Ross Geller"]
lower_df["cast"] = ["Matt LeBlanc", "Matthew Perry", "David Schwimmer"]
print("Upper df:")
upper_df
print("Lower df:")
lower_df
print("Concatenated vertically:")
pd.concat([upper_df, lower_df]) # axis=0 as default
Upper df: Lower df: Concatenated vertically:
character | cast | |
---|---|---|
0 | Rachel Green | Jennifer Aniston |
1 | Monica Geller | Courteney Cox |
2 | Phoebe Buffay | Lisa Kudrow |
0 | Joey Tribbiani | Matt LeBlanc |
1 | Chandler Bing | Matthew Perry |
2 | Ross Geller | David Schwimmer |
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["character"] = ["Rachel Green", "Monica Geller", "Phoebe Buffay", "Joey Tribbiani", "Chandler Bing", "Ross Geller"]
right_df["cast"] = ["Jennifer Aniston", "Courteney Cox", "Lisa Kudrow", "Matt LeBlanc", "Matthew Perry", "David Schwimmer"]
print("Left df:")
left_df
print("Right df:")
right_df
print("Concatenated horizontally:")
pd.concat([left_df, right_df], axis=1)
Left df: Right df: Concatenated horizontally:
character | cast | |
---|---|---|
0 | Rachel Green | Jennifer Aniston |
1 | Monica Geller | Courteney Cox |
2 | Phoebe Buffay | Lisa Kudrow |
3 | Joey Tribbiani | Matt LeBlanc |
4 | Chandler Bing | Matthew Perry |
5 | Ross Geller | David Schwimmer |
upper_df = pd.DataFrame()
lower_df = pd.DataFrame()
upper_df["character"] = ["Rachel Green", "Monica Geller", "Phoebe Buffay"]
upper_df["cast"] = ["Jennifer Aniston", "Courteney Cox", "Lisa Kudrow"]
lower_df["character"] = ["Joey Tribbiani", "Chandler Bing", "Ross Geller"]
lower_df["cast"] = ["Matt LeBlanc", "Matthew Perry", "David Schwimmer"]
print("Upper df:")
upper_df
print("Lower df:")
lower_df
print("Concatenated vertically using append method:")
upper_df.append(lower_df)
Upper df: Lower df: Concatenated vertically using append method:
character | cast | |
---|---|---|
0 | Rachel Green | Jennifer Aniston |
1 | Monica Geller | Courteney Cox |
2 | Phoebe Buffay | Lisa Kudrow |
0 | Joey Tribbiani | Matt LeBlanc |
1 | Chandler Bing | Matthew Perry |
2 | Ross Geller | David Schwimmer |
在 Pandas 中若想要高效能操作類似關聯式資料庫表格聯結和合併,主要的實踐函數是 pd.merge() ,她沿用關聯式資料庫的正規法則 Relational Algebra,實踐正規法則所規範四種基礎聯結
#一對一聯結(one-to-one)
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
left_df["release_year"] = [2012, 2015, 2018, 2019]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "The Avengers", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 8.5, 7.3]
left_df
right_df
pd.merge(left_df, right_df)
title | release_year | rating | |
---|---|---|---|
0 | The Avengers | 2012 | 8.5 |
1 | Avengers: Age of Ultron | 2015 | 7.3 |
2 | Avengers: Infinity War | 2018 | 8.5 |
3 | Avengers: Endgame | 2019 | 8.6 |
#一對多聯結(one-to-many)
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers"]
left_df["release_year"] = [2012]
right_df["title"] = ["The Avengers", "The Avengers", "The Avengers"]
right_df["genre"] = ["Action", "Adventure", "Sci-Fi"]
left_df
right_df
pd.merge(left_df, right_df)
title | release_year | genre | |
---|---|---|---|
0 | The Avengers | 2012 | Action |
1 | The Avengers | 2012 | Adventure |
2 | The Avengers | 2012 | Sci-Fi |
#多對一聯結(many-to-one)
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "The Avengers", "The Avengers"]
left_df["genre"] = ["Action", "Adventure", "Sci-Fi"]
right_df["title"] = ["The Avengers"]
right_df["release_year"] = [2012]
left_df
right_df
pd.merge(left_df, right_df)
title | genre | release_year | |
---|---|---|---|
0 | The Avengers | Action | 2012 |
1 | The Avengers | Adventure | 2012 |
2 | The Avengers | Sci-Fi | 2012 |
#多對多聯結(many-to-many)
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "The Avengers", "The Avengers"]
left_df["genre"] = ["Action", "Adventure", "Sci-Fi"]
right_df["title"] = ["The Avengers"]*6
right_df["avengers"] = ["Ironman", "Captain America", "The Hulk", "Thor", "Black Widow", "Hawkeye"]
left_df
right_df
pd.merge(left_df, right_df)
title | genre | avengers | |
---|---|---|---|
0 | The Avengers | Action | Ironman |
1 | The Avengers | Action | Captain America |
2 | The Avengers | Action | The Hulk |
3 | The Avengers | Action | Thor |
4 | The Avengers | Action | Black Widow |
5 | The Avengers | Action | Hawkeye |
6 | The Avengers | Adventure | Ironman |
7 | The Avengers | Adventure | Captain America |
8 | The Avengers | Adventure | The Hulk |
9 | The Avengers | Adventure | Thor |
10 | The Avengers | Adventure | Black Widow |
11 | The Avengers | Adventure | Hawkeye |
12 | The Avengers | Sci-Fi | Ironman |
13 | The Avengers | Sci-Fi | Captain America |
14 | The Avengers | Sci-Fi | The Hulk |
15 | The Avengers | Sci-Fi | Thor |
16 | The Avengers | Sci-Fi | Black Widow |
17 | The Avengers | Sci-Fi | Hawkeye |
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
left_df["release_year"] = [2012, 2015, 2018, 2019]
right_df["movie_name"] = ["Avengers: Infinity War", "Avengers: Endgame", "The Avengers", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 8.5, 7.3]
left_df
right_df
pd.merge(left_df, right_df, left_on="title", right_on="movie_name")
title | release_year | movie_name | rating | |
---|---|---|---|---|
0 | The Avengers | 2012 | The Avengers | 8.5 |
1 | Avengers: Age of Ultron | 2015 | Avengers: Age of Ultron | 7.3 |
2 | Avengers: Infinity War | 2018 | Avengers: Infinity War | 8.5 |
3 | Avengers: Endgame | 2019 | Avengers: Endgame | 8.6 |
#交集(預設)
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War"]
left_df["release_year"] = [2012, 2015, 2018]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 7.3]
left_df
right_df
print("Inner join:")
pd.merge(left_df, right_df)
Inner join:
title | release_year | rating | |
---|---|---|---|
0 | Avengers: Age of Ultron | 2015 | 7.3 |
1 | Avengers: Infinity War | 2018 | 8.5 |
#以左邊資料框存在的觀測值為主
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War"]
left_df["release_year"] = [2012, 2015, 2018]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 7.3]
left_df
right_df
print("Left join:")
pd.merge(left_df, right_df, how="left")
Left join:
title | release_year | rating | |
---|---|---|---|
0 | The Avengers | 2012 | NaN |
1 | Avengers: Age of Ultron | 2015 | 7.3 |
2 | Avengers: Infinity War | 2018 | 8.5 |
#以右邊資料框存在的觀測值為主
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War"]
left_df["release_year"] = [2012, 2015, 2018]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 7.3]
left_df
right_df
print("Right join:")
pd.merge(left_df, right_df, how="right")
Right join:
title | release_year | rating | |
---|---|---|---|
0 | Avengers: Age of Ultron | 2015.0 | 7.3 |
1 | Avengers: Infinity War | 2018.0 | 8.5 |
2 | Avengers: Endgame | NaN | 8.6 |
#聯集
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War"]
left_df["release_year"] = [2012, 2015, 2018]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 7.3]
left_df
right_df
print("Outer join:")
pd.merge(left_df, right_df, how="outer")
Outer join:
title | release_year | rating | |
---|---|---|---|
0 | The Avengers | 2012.0 | NaN |
1 | Avengers: Age of Ultron | 2015.0 | 7.3 |
2 | Avengers: Infinity War | 2018.0 | 8.5 |
3 | Avengers: Endgame | NaN | 8.6 |
left_df = pd.DataFrame()
right_df = pd.DataFrame()
left_df["title"] = ["The Avengers", "Avengers: Age of Ultron", "Avengers: Infinity War", "Avengers: Endgame"]
left_df["release_year"] = [2012, 2015, 2018, 2019]
right_df["title"] = ["Avengers: Infinity War", "Avengers: Endgame", "The Avengers", "Avengers: Age of Ultron"]
right_df["rating"] = [8.5, 8.6, 8.5, 7.3]
left_df = left_df.set_index("title")
right_df = right_df.set_index("title")
left_df
right_df
left_df.join(right_df)
release_year | rating | |
---|---|---|
title | ||
The Avengers | 2012 | 8.5 |
Avengers: Age of Ultron | 2015 | 7.3 |
Avengers: Infinity War | 2018 | 8.5 |
Avengers: Endgame | 2019 | 8.6 |
import pandas as pd
census_df = pd.read_csv('https://storage.googleapis.com/py_ml_datasets/census.csv')
census_df.shape
(3193, 100)
census_df.head()
SUMLEV | REGION | DIVISION | STATE | COUNTY | STNAME | CTYNAME | CENSUS2010POP | ESTIMATESBASE2010 | POPESTIMATE2010 | ... | RDOMESTICMIG2011 | RDOMESTICMIG2012 | RDOMESTICMIG2013 | RDOMESTICMIG2014 | RDOMESTICMIG2015 | RNETMIG2011 | RNETMIG2012 | RNETMIG2013 | RNETMIG2014 | RNETMIG2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40 | 3 | 6 | 1 | 0 | Alabama | Alabama | 4779736 | 4780127 | 4785161 | ... | 0.002295 | -0.193196 | 0.381066 | 0.582002 | -0.467369 | 1.030015 | 0.826644 | 1.383282 | 1.724718 | 0.712594 |
1 | 50 | 3 | 6 | 1 | 1 | Alabama | Autauga County | 54571 | 54571 | 54660 | ... | 7.242091 | -2.915927 | -3.012349 | 2.265971 | -2.530799 | 7.606016 | -2.626146 | -2.722002 | 2.592270 | -2.187333 |
2 | 50 | 3 | 6 | 1 | 3 | Alabama | Baldwin County | 182265 | 182265 | 183193 | ... | 14.832960 | 17.647293 | 21.845705 | 19.243287 | 17.197872 | 15.844176 | 18.559627 | 22.727626 | 20.317142 | 18.293499 |
3 | 50 | 3 | 6 | 1 | 5 | Alabama | Barbour County | 27457 | 27457 | 27341 | ... | -4.728132 | -2.500690 | -7.056824 | -3.904217 | -10.543299 | -4.874741 | -2.758113 | -7.167664 | -3.978583 | -10.543299 |
4 | 50 | 3 | 6 | 1 | 7 | Alabama | Bibb County | 22915 | 22919 | 22861 | ... | -5.527043 | -5.068871 | -6.201001 | -0.177537 | 0.177258 | -5.088389 | -4.363636 | -5.403729 | 0.754533 | 1.107861 |
5 rows × 100 columns
def answer_one(df):
"""
這個函數應該回傳一個文字,州名
"""
return "答案"
answer_one(census_df)
'Texas'
def answer_two(df):
"""
這個函數應該回傳一個清單,三個州名
"""
return "答案"
answer_two(census_df)
['California', 'Texas', 'Illinois']
提示:如果 6 年的人口數分別為 120, 80, 105, 100, 130, 120 則人口改變數量為 130-80 = 50
def answer_three(df):
"""
這個函數應該回傳一個文字,郡名
"""
return "答案"
answer_three(census_df)
'Harris County'
def answer_four():
"""
這個函數應該回傳一個 DataFrame,外型為 5x2,
變數名稱為 ['STNAME', 'CTYNAME'],
索引值由小到大排列
"""
return "答案"
answer_four(census_df)
STNAME | CTYNAME | |
---|---|---|
0 | Iowa | Washington County |
1 | Minnesota | Washington County |
2 | Pennsylvania | Washington County |
3 | Rhode Island | Washington County |
4 | Wisconsin | Washington County |
pandas
回答最多的上映日期為哪一天?¶answer_one()
release_date 12 July 2019 2 2 September 2019 1 22 April 2019 1 23 April 2019 1 24 April 2019 33 25 April 2019 23 26 April 2019 14 26 July 2019 1 28 April 2019 1 28 June 2019 3 29 April 2019 1 29 June 2019 1 4 July 2019 1 Name: country, dtype: int64
answer_one().idxmax()
'24 April 2019'
pandas
回答有幾個國家在那天上映?¶answer_two()
'共有 33 個國家在 24 April 2019 上映 Avengers: Endgame (2019)'