1012 Python pandas DataFrame

Develop/Python

1012 Python pandas DataFrame

포페PostFace 2022. 10. 13. 17:19

# Series : 객체를 담을 수 있는 1차원 배열과 같은 자료구조
# DataFrame : 2차원 형태의 행/열 구조를 가지는 자료구조. 열 1개가 하나의 Series 구조다.

import numpy as np
import pandas as pd

# Build a 2x3 ndarray from a nested list.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print('arr:\n', arr)
# Object type, number of dimensions, and shape.
print(type(arr), arr.ndim, arr.shape)  # <class 'numpy.ndarray'> 2 (2, 3)
# ndarray indexing order is (row, column).
print(arr[0, 1])  # 2

# The same nested list as a DataFrame; rows and columns get default
# integer labels.
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]])
print('df:\n', df)
# Expected output:
#    0  1  2
# 0  1  2  3
# 1  4  5  6
print(type(df), df.ndim, df.shape)
# Chained [] on a DataFrame is [column label][row label], i.e. the reverse
# of the ndarray [row][column] order.
print(df[0][1])  # column 0, row 1 -> 4
print(df[1][0])  # column 1, row 0 -> 2

# A dict of lists: each key becomes a column label, each list a column.
d = {
    'a': [10, 20],
    'b': [30, 40],
    'c': [50, 60],
}
df2 = pd.DataFrame(data=d)
print('df2:\n', df2)
# Expected output:
#     a   b   c
# 0  10  30  50
# 1  20  40  60
print()

# Row data plus explicit row labels (index) and column labels.
df3 = pd.DataFrame(
    data=[
        [1, 2, 3, 4],
        [5, 6, 7, 8],
    ],
    index=range(2),
    columns=['A', 'B', 'C', 'D'],
)
print('df3:\n', df3)
# Expected output:
#    A  B  C  D
# 0  1  2  3  4
# 1  5  6  7  8

# A list of Series: each Series becomes one ROW of the resulting frame.
s1 = pd.Series([10, 20, 30, 40])
s2 = pd.Series(['A', 'B', 'C', 'D'])

df4 = pd.DataFrame(data=[s1, s2])
print('df4:\n', df4)
# Expected output:
#     0   1   2   3
# 0  10  20  30  40
# 1   A   B   C   D
# City / year / population sample table with rows labelled 0..4.
df5 = pd.DataFrame(
    data=[
        ['서울', 2000, 1.5],
        ['서울', 2001, 1.7],
        ['서울', 2002, 3.6],
        ['부산', 2001, 2.4],
        ['부산', 2002, 2.9],
    ],
    index=range(5),
    columns=['도시', 'year', 'pop'],
)
print('df5:\n', df5)
# Expected output:
#    도시  year  pop
# 0  서울  2000  1.5
# 1  서울  2001  1.7
# 2  서울  2002  3.6
# 3  부산  2001  2.4
# 4  부산  2002  2.9

# The three building blocks of a frame: row labels, column labels, raw values.
print(df5.index)
print(df5.columns)
print(df5.values)

# Selecting one column with [] yields a Series.
print(df5['도시'])
print(type(df5['도시']))

# Bracket access and attribute access refer to the same column.
print(df5["year"])
print(df5.year)

# A list of labels inside [] selects several columns and yields a DataFrame.
print(df5[['도시', 'year']])

print(df5['도시'])

# Nested dict layout: {column: {index: value, ...}, ...}.
# Outer keys become column labels, inner keys become row labels.
df6 = {
    'Korea': {'capital': '서울', 'population': 51821669},
    'China': {'capital': '베이징', 'population': 1444216102},
    'Singapore': {'capital': '싱가포르', 'population': 5896684},
    'Vietnam': {'capital': '하노이', 'population': 98168829},
}

# Printed once as the plain dict, then again after conversion to a DataFrame.
print('df6:\n', df6)
df6 = pd.DataFrame(data=df6)
print('df6:\n', df6)
# Expected output after conversion:
#             Korea      China    Singapore   Vietnam
# capital      서울        베이징      싱가포르     하노이
# population  51821669  1444216102   5896684  98168829
print('df5:\n', df5)

# Adding a column, three ways.
# 1) A scalar is broadcast to every row.
df5['income'] = 1000
print('df5:\n', df5)
# 2) A list supplies one value per row, in row order.
df5['income'] = [11, 12, 13, 14, 15]
print('df5:\n', df5)

# 3) A Series is matched by index label, not by position; this one only
#    carries labels 0, 2 and 4.
s = pd.Series([3, 7, 9], index=[0, 2, 4])
print('s:', s)
df5['income'] = s
print('df5:\n', df5)
# Fruit columns keyed by year; the inner dict keys become the row labels.
df7 = {
    '사과': {2001: 10, 2002: 20, 2003: 30},
    '포도': {2001: 30, 2002: 40, 2003: 50},
}
df7 = pd.DataFrame(data=df7)
print('df7:\n', df7)
# Three spellings of the same transpose (columns <-> index).
print(df7.T)
print(df7.transpose())
print(np.transpose(df7))
from pandas import DataFrame
# Row labels and column labels shared by both constructions below.
idx = ['one', 'two', 'three', 'four']
col = ['서울', '부산', '광주', '대구']

# 4x4 table holding the values 0..15, spelled out row by row.
df8 = DataFrame(
    data=[
        [0, 1, 2, 3],
        [4, 5, 6, 7],
        [8, 9, 10, 11],
        [12, 13, 14, 15],
    ],
    index=idx,
    columns=col,
)
print('df8:\n', df8)

# Same construction with positional arguments (data, index, columns) and
# generated values. The result is neither stored nor printed.
DataFrame(np.arange(16).reshape([4, 4]), idx, col)
# Rows: reindex() keeps the listed labels, drop() removes them.
# The index= keyword may be omitted for reindex.
print(df8.reindex(index=['one', 'four']))
df8_1 = df8.drop(['two', 'three'])
print('df8_1:\n', df8_1)

# Columns: the columns= keyword is required for reindex.
df8_2 = df8.reindex(columns=['부산', '광주'])
print('df8_2:\n', df8_2)
# Two ways to drop columns: the columns= keyword, or labels plus axis=1.
df8_3 = df8.drop(columns=['부산', '광주'])
print('df8_3:\n', df8_3)
df8_4 = df8.drop(['부산', '광주'], axis=1)
print('df8_4:\n', df8_4)

# Rows and columns in a single call.
print(df8.reindex(index=['one', 'three'], columns=['부산', '광주']))
df8_5 = df8.drop(index=['two', 'four'], columns=['서울', '대구'])
print('df8_5:\n', df8_5)

# A Series built from a dict: the integer keys become the index labels.
d = {49: 'A', 48: 'B', 47: 'C', 46: 'D', 2: 'E', 3: 'F', 4: 'G'}
s = pd.Series(data=d)
print('s:', s)

# Plain [] and .loc both look values up by index LABEL here.
print(s[3])       # F
print(s[47])      # C
print(s.loc[3])   # F
print(s.loc[47])  # C

# .iloc uses the 0-based POSITION and ignores the labels entirely.
print(s.iloc[3])
# s.iloc[47] would be an error: the Series has only 7 positions.

# Slice end points: .loc includes the end label, .iloc excludes the end position.
print(s.loc[:3])
print(s.iloc[:3])

print('---------------------------------------------')
print('df8:\n', df8)

# .loc with a single row label returns that row as a Series.
print(df8.loc['two'])
# Expected output:
# 서울    4
# 부산    5
# 광주    6
# 대구    7

# .loc with a list of labels returns a DataFrame, even for one row.
print(df8.loc[['two']])
# Expected output:
#     서울  부산  광주  대구
# two   4   5   6   7

# .iloc selects the row by 0-based position.
print(df8.iloc[1])

# Three spellings that all yield rows 'two' and 'three'.
df9 = df8.loc[['two', 'three']]
print('df9:\n', df9)
print(df8.reindex(index=['two', 'three']))
df8_6 = df8.drop(index=['one', 'four'])
print('df8_6:\n', df8_6)

# Single cell: plain [] is column first, then row ...
print(df8['서울']['two'])

# ... whereas .loc is row first, then column.
print(df8.loc['two']['서울'])
# List arguments on both axes give a DataFrame.
print(df8.loc[['two'], ['서울']])
print(df8.loc[['two', 'three'], ['서울', '광주']])
print(df8.reindex(index=['two', 'three'], columns=['서울', '광주']))
df8_7 = df8.drop(index=['one', 'four'], columns=['부산', '대구'])
print(df8_7)

# Boolean filtering: a comparison yields a boolean Series, which then
# selects the matching rows.
print(df8['부산'] > 5)
print(df8[df8['부산'] > 5])
print(df8[df8['대구'] > 10])
print('-------------------')
# Per-column totals. axis=0 is passed explicitly because a bare np.sum(df)
# forwards axis=None to DataFrame.sum; pandas 2.x documents that form as
# deprecated and slated to return a single scalar instead of one total per
# column.
print(np.sum(df8, axis=0))
# apply() calls the function once per column by default (axis=0).
print(df8.apply(np.sum))
print(df8.apply(np.sum, axis=0))          # total per column
print(df8.apply(np.sum, axis=1))          # total per row
# String aliases: 'index' is the same as 0, 'columns' the same as 1.
print(df8.apply(np.sum, axis='index'))    # total per column
print(df8.apply(np.sum, axis='columns'))  # total per row
print(df8.apply(np.max, axis=0))          # maximum of each column
print(df8.apply(np.min, axis=1))          # minimum of each row
# Grow the frame in place: assigning to a new column label adds a column ...
df8['제주'] = [10, 20, 30, 40]
print('df8:\n', df8)
# ... and assigning to a new row label through .loc adds a row.
df8.loc['five'] = [3, 6, 9, 12, 15]
print('df8:\n', df8)

# Sort by labels: row labels by default, column labels with axis=1.
print(df8.sort_index())
print(df8.sort_index(axis=1))
print(df8.sort_index(axis=1, ascending=False))
# Sort by the values of one column, ascending and then descending.
print(df8.sort_values(by='부산'))
print(df8.sort_values(by='부산', ascending=False))
# Two sort keys with one direction per key: first key descending,
# second key ascending.
print(df8.sort_values(by=['대구', '제주'], ascending=[False, True]))

# Exam scores: outer keys are subjects (columns), inner keys are
# students (rows).
d = {
    '국어': {'정연': 40, '사나': 50, '지효': 50, '채영': 11},
    '영어': {'정연': 70, '사나': 50, '지효': 20, '채영': 22},
    '수학': {'정연': 20, '사나': 20, '지효': 20, '채영': 33},
}
score = pd.DataFrame(data=d)
print('score:\n', score)

# Total per student (sum across each row). With the data above:
#   정연 130
#   사나 120
#   지효  90
#   채영  66
print(score.apply(np.sum, axis=1))
print(score.apply(np.sum, axis='columns'))
print()

# Highest score per subject (max down each column):
#   국어 50
#   영어 70
#   수학 33
print(score.apply(np.max, axis=0))
print()

# Sort by '수학' descending, ties broken by '영어' ascending:
#       국어  영어  수학
#   채영  11  22  33
#   지효  50  20  20
#   사나  50  50  20
#   정연  40  70  20
print(score.sort_values(by=['수학', '영어'], ascending=[False, True]))
print()

#          kim  park  jung
# apple     3     5   1
# banana    9     2   2

from pandas import DataFrame
import pandas as pd
import numpy as np

# Fruit counts per person, with the apple/park cell deliberately missing.
# np.nan is the supported spelling: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
# NOTE(review): the second row supplies only two values for three columns;
# pandas appears to pad the missing 'jung' cell with NaN, while the
# dict-built frame that follows in this file uses 2 for banana/jung.
# Confirm which is intended.
a1 = DataFrame(
    data=[[3, np.nan, 1], [9, 2]],
    index=['apple', 'banana'],
    columns=['kim', 'park', 'jung'],
)
print('a1:\n', a1)
print()

# Nested-dict construction of the same kind of table: 'park' has no
# 'apple' entry, so that cell comes out as NaN.
df = {
    'kim': {'apple': 3, 'banana': 9},
    'park': {'banana': 2},
    'jung': {'apple': 1, 'banana': 2},
}
df = pd.DataFrame(data=df)
print('df:\n', df)
# Load the sample CSV (EUC-KR encoded) with its first column as the row index.
filename = 'mynan.csv'
table = pd.read_csv(filename, encoding='euc-kr', index_col=0)
print(table)
# Recorded output WITHOUT index_col=0 (first column shows up as "Unnamed: 0"):
#    Unnamed: 0 name   kor   eng
#  0           0  김철수  50.0  30.0
#  1           1  박순희  40.0   NaN
#  2           2  홍길동   NaN   NaN
# Recorded output WITH index_col=0:
#    name   kor   eng
#  0  김철수  50.0  30.0
#  1  박순희  40.0   NaN
#  2  홍길동   NaN   NaN
print(type(table))  # <class 'pandas.core.frame.DataFrame'>
print()

# Missing-value masks: True marks a NaN cell.
print(table.isna())
# Recorded output:
#      name    kor    eng
#  0  False  False  False
#  1  False  False   True
#  2  False   True   True
print()
# The module-level function gives the same mask as the method.
print(pd.isna(table))
print()
# notnull() is the inverse mask.
print(table.notnull())
print()

# Drop every row containing at least one NaN.
table2 = table.dropna()
print('table2:\n', table2)
# Recorded output:
#     name   kor   eng
#  0  김철수  50.0  30.0
print()

# Drop only the rows whose 'kor' value is NaN.
table2 = table.dropna(subset='kor')
print('table2:\n', table2)
# Recorded output:
#     name   kor   eng
#  0  김철수  50.0  30.0
#  1  박순희  40.0   NaN
print()

# dropna() returned new frames; the original table is shown unchanged here.
print('table:\n', table)
print()
# Fill NaN with a separate default per column, rebinding the name to the result.
table = table.fillna({'kor': 20, 'eng': 70})
print('table:\n', table)
print()

import pandas as pd
import numpy as np

# Exercise 1. Create the DataFrame from the member CSV.
# Expected columns per the exercise notes: '이름' (name), '나이' (age),
# '주소' (address), with row labels 1..4:
#      이름    나이   주소
# 1    지수    30    서울
# 2    제니    27    부산
# 3    로제    47    제주
# 4    리사    19    대구
filename = 'member.csv'
df = pd.read_csv(filename, encoding='euc-kr', index_col=0)
print('df:\n', df)

# Exercise 2. Add a '성별' (gender) column, one value per row in row order.
df['성별'] = ['남', '여', '남', '여']
print('df:\n', df)

# Exercise 3. Show only the name and gender of rows 1 and 4.
# Expected:
#      이름  성별
# 1    지수   남
# 4    리사   여
print('df:\n', df.loc[[1, 4], ['이름', '성별']])

# Exercise 4. Mean age of all members (expected 30.75).
print('나이 ', np.average(df['나이']))

# Exercise 5. Keep only the rows with age >= 30 (expected rows 1 and 3).
print('df:\n', df[df['나이'] >= 30])

# Exercise 6. Sort by gender ascending, then age descending.
# Expected row order: 3, 1, 2, 4.
print(df.sort_values(by=['성별', '나이'], ascending=[True, False]))

'Develop > Python' 카테고리의 다른 글

1013 Python csv (0)	2022.10.13
1011 Python Numpy,pandas(맛보기) (1)	2022.10.13
1007 Python Class,예외처리,DB (1)	2022.10.13
1006 Python 파일 입출력,클래스 (0)	2022.10.06
1005 Python list 이어서,튜플,사전,함수,모듈 (1)	2022.10.06

현재글1012 Python pandas DataFrame

포페의 개발공부

Today :
Yesterday :

포페의 개발공부

1012 Python pandas DataFrame

'Develop > Python' 카테고리의 다른 글

'Develop/Python'의 다른글

티스토리툴바

« 2025/02 »
일	월	화	수	목	금	토
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28

1012 Python pandas DataFrame

'Develop > Python' 카테고리의 다른 글

'Develop/Python'의 다른글

관련글

티스토리툴바