言語処理100本ノック(問題14〜問題19)
問題14〜15
import codecs
import io
import os
import sys


def _force_utf8(stream):
    """Return *stream* set up to encode its output as UTF-8.

    ``reconfigure()`` (Python 3.7+) changes the encoding in place instead of
    layering a second wrapper over the same underlying buffer.  Streams that
    lack it are re-wrapped through ``.buffer``; objects that offer neither
    (for example a replaced or missing stdout) are returned unchanged.
    """
    if hasattr(stream, "reconfigure"):
        stream.reconfigure(encoding="utf-8")
        return stream
    if hasattr(stream, "buffer"):
        return io.TextIOWrapper(stream.buffer, encoding="utf-8")
    return stream


sys.stdout = _force_utf8(sys.stdout)
sys.stderr = _force_utf8(sys.stderr)
argvs = sys.argv


# Problem 14
def head_out():
    """Print the first N lines of a UTF-8 text file (like ``head -n N``).

    N is read from ``argvs[1]`` and the file path from ``argvs[2]``.
    A negative N follows Python slice semantics: the last ``|N|`` lines
    are left out.

    Raises:
        IndexError: if either command-line argument is missing.
        ValueError: if ``argvs[1]`` is not an integer.
        OSError: if the file cannot be opened.
    """
    n = int(argvs[1])
    file_name = argvs[2]
    with open(file_name, encoding="utf-8") as f:
        selected = f.readlines()[:n]
    # Each line still carries its own newline, so print() must not add one.
    for text in selected:
        print(text, end="")


# Problem 15
def tail_out():
    """Print the last N lines of a UTF-8 text file (like ``tail -n N``).

    N is read from ``argvs[1]`` and the file path from ``argvs[2]``.
    When N is larger than the number of lines, the whole file is printed.

    Raises:
        IndexError: if either command-line argument is missing.
        ValueError: if ``argvs[1]`` is not an integer.
        OSError: if the file cannot be opened.
    """
    n = int(argvs[1])
    file_name = argvs[2]
    with open(file_name, encoding="utf-8") as f:
        lines = f.readlines()
    # Clamp at 0: a negative start index would count from the end of the
    # list and silently drop lines when N exceeds the file length.
    start = max(len(lines) - n, 0)
    for text in lines[start:]:
        print(text, end="")
問題16
# Problem 16
import sys
import codecs
import os
from itertools import islice


def split_file(path, n):
    """Split a UTF-8 text file into consecutive pieces of *n* lines each.

    Piece k (counting from 1) is written to ``<stem>_<k>.txt``, where
    ``<stem>`` is *path* without its final extension.  The last piece holds
    whatever lines remain and may be shorter than *n*.  An empty input
    produces no output files.

    Args:
        path: Path of the file to split.
        n: Number of lines per piece; must be at least 1.

    Returns:
        The list of output paths, in the order they were written.

    Raises:
        ValueError: if *n* is smaller than 1.
        OSError: if the input cannot be read or an output cannot be written.
    """
    if n < 1:
        raise ValueError("N must be a positive integer")

    # splitext() only strips the last extension of the final path component,
    # so paths such as "./data/a.b.txt" keep their directory and inner dots.
    stem = os.path.splitext(path)[0]
    written = []

    # newline="\n" on read: only "\n" ends a line and nothing is translated.
    with open(path, encoding="utf-8", newline="\n") as src:
        while True:
            chunk = list(islice(src, n))
            if not chunk:
                break
            out_path = "%s_%s.txt" % (stem, len(written) + 1)
            # newline="" on write: line endings are emitted exactly as read.
            with open(out_path, "w", encoding="utf-8", newline="") as dst:
                dst.writelines(chunk)
            written.append(out_path)
    return written


def main(args):
    """Run the split for ``args = [program, FILE, N]``."""
    if len(args) != 3:
        sys.exit("usage: %s FILE N" % args[0])
    split_file(args[1], int(args[2]))


if __name__ == "__main__":
    main(sys.argv)
同じことができるUNIXコマンド
fileをN行ごとに分割
split -l N file
分割後のファイルには xaa, xab, xac, ... という名前が付けられる。
問題17〜19
# Problem 17
import sys
import codecs
import pandas as pd


def unique_first_column(path):
    """Return the distinct first-column values of a tab-separated file.

    Values are returned in the order in which they first appear.  A line
    without any tab contributes the whole line, including its line ending.

    Args:
        path: Path of a UTF-8 encoded, tab-separated text file.

    Returns:
        A list of strings without duplicates.
    """
    # newline="" keeps line endings untranslated.
    with open(path, encoding="utf-8", newline="") as f:
        # dict keys are unique and keep insertion order, which gives
        # first-seen de-duplication without a linear scan per line.
        return list(dict.fromkeys(row.split("\t", 1)[0] for row in f))


def main(args):
    """Print the results of problems 17-19 for the file named in ``args[1]``."""
    path = args[1]

    # Problem 17: distinct values of column 0, computed by hand.
    print(unique_first_column(path))

    # Problem 17 with pandas.
    df = pd.read_csv(path, header=None, delimiter="\t")
    print(df[0].unique())

    # Problem 18 with pandas: rows ordered by column 2, largest first.
    print(df)
    df = df.sort_values(by=2, ascending=False)
    print(df)

    # Problem 19: frequency of each column-0 value, most frequent first.
    # NOTE(review): value_counts() already sorts in descending order by
    # default; the explicit sort is retained so the printed result stays
    # the same as before.
    count = df[0].value_counts()
    count = count.sort_values(ascending=False)
    print(count)


if __name__ == "__main__":
    main(sys.argv)