言語処理100本ノック(問題14〜問題19)

問題14〜15

import io,sys
import codecs
import os
# Re-wrap the binary buffers behind stdout and stderr so that everything
# written through them is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
# Command-line arguments read by head_out() and tail_out():
# argvs[1] is the line count N, argvs[2] is the input file name.
argvs = sys.argv

#14
def head_out(n=None, file_name=None):
    """Print the first ``n`` lines of a UTF-8 text file to standard output.

    Args:
        n: Number of leading lines to print. When omitted, ``int(argvs[1])``
            is used.
        file_name: Path of the file to read. When omitted, ``argvs[2]`` is
            used.

    A negative ``n`` follows slice semantics (``lines[:n]``), i.e. everything
    except the last ``abs(n)`` lines is printed.
    """
    if n is None:
        n = int(argvs[1])
    if file_name is None:
        file_name = argvs[2]
    # The context manager closes the file even if reading fails.
    with open(file_name, encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines[:n]:
        # Each line still carries its own newline; suppress the extra one
        # print() would add so the output is not double-spaced.
        print(line, end="")

#15
def tail_out(n=None, file_name=None):
    """Print the last ``n`` lines of a UTF-8 text file to standard output.

    Args:
        n: Number of trailing lines to print. When omitted, ``int(argvs[1])``
            is used. If ``n`` exceeds the number of lines, the whole file is
            printed; if it is zero or negative, nothing is printed.
        file_name: Path of the file to read. When omitted, ``argvs[2]`` is
            used.
    """
    if n is None:
        n = int(argvs[1])
    if file_name is None:
        file_name = argvs[2]
    # The context manager closes the file even if reading fails.
    with open(file_name, encoding="utf-8") as f:
        lines = f.readlines()
    # Clamp the start index at 0: a negative start would wrap around and
    # silently drop lines whenever n is larger than the file.
    start = max(len(lines) - n, 0)
    for line in lines[start:]:
        # Each line still carries its own newline; suppress the extra one
        # print() would add so the output is not double-spaced.
        print(line, end="")

問題16

#16
import sys
import codecs
import os


def _write_chunk(base, index, chunk):
    """Write the lines in *chunk* to ``<base>_<index>.txt`` and return that name."""
    out_name = base + "_%s.txt" % index
    with codecs.open(out_name, "w", encoding="utf-8") as g:
        g.write("".join(chunk))
    return out_name


def split_by_lines(path, n):
    """Split the UTF-8 text file *path* into pieces of at most *n* lines each.

    The pieces are written next to the input as ``<stem>_1.txt``,
    ``<stem>_2.txt``, ... where ``<stem>`` is *path* without its extension.
    Every input line ends up in exactly one piece; the last piece may hold
    fewer than *n* lines. An empty input produces no output files.

    Args:
        path: Path of the file to split.
        n: Maximum number of lines per output file; must be at least 1.

    Returns:
        The list of output file names, in the order they were written.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("N must be a positive integer")
    # splitext only strips the final extension, so directory parts that
    # contain dots (e.g. "./data/in.txt") are kept intact.
    base = os.path.splitext(path)[0]
    created = []
    chunk = []
    with codecs.open(path, encoding="utf-8") as f:
        for line in f:
            chunk.append(line)
            if len(chunk) == n:
                created.append(_write_chunk(base, len(created) + 1, chunk))
                chunk = []
    # Flush the trailing lines that did not fill a whole chunk.
    if chunk:
        created.append(_write_chunk(base, len(created) + 1, chunk))
    return created


if __name__ == "__main__":
    # Usage: python <script> FILE N
    args = sys.argv
    split_by_lines(args[1], int(args[2]))

同じことができるUNIXコマンド

fileをN行ごとに分割

split -l N file

分割後のファイルは xaa, xab, xac, ... という名前で出力される。

問題17〜19

#17
import sys
import codecs
import pandas as pd


def unique_first_fields(path):
    """Return the distinct first-column values of a tab-separated UTF-8 file.

    Args:
        path: Path of the file to read.

    Returns:
        A list with the text before the first tab of each line, duplicates
        removed, in order of first appearance.
    """
    ordered = []
    # The set gives O(1) membership tests; the list preserves first-seen order.
    seen = set()
    with codecs.open(path, "r", "utf-8") as f:
        for line in f:
            key = line.split("\t")[0]
            if key not in seen:
                seen.add(key)
                ordered.append(key)
    return ordered


# Kept at module level: the pandas snippets below read args[1] as well.
args=sys.argv
if __name__ == "__main__":
    print(unique_first_fields(args[1]))

#17 by pandas
# Load the tab-separated file without a header row, so the columns are
# labelled 0, 1, 2, ...
df = pd.read_csv(args[1], sep='\t', header=None)
# Distinct values of the first column.
first_column = df[0]
unique = first_column.unique()
print(unique)

#18 by pandas
# Show the table before sorting, then sort the rows by the third column
# (label 2) in descending order and show the result.
print(df)
df=df.sort_values(by=2,ascending=False)
print(df)

#19
# Count how often each first-column value occurs and list the counts from
# most to least frequent.
count = df[0].value_counts().sort_values(ascending=False)
print(count)