|
| 1 | +import numpy as np |
| 2 | +import os |
| 3 | +from scipy.misc import imread,imresize |
| 4 | +import pandas as pd |
| 5 | + |
def fetch_lfw_dataset(attrs_name="lfw_attributes.txt",
                      images_name="lfw-deepfunneled",
                      dx=80, dy=80,
                      dimx=45, dimy=45):
    """Load LFW photos and their attribute table, downloading them if missing.

    Parameters
    ----------
    attrs_name : str
        Path of the tab-separated attributes file. If it does not exist it is
        fetched with ``wget`` (the name is appended to the download URL).
    images_name : str
        Directory that is walked recursively for ``*.jpg`` photos. If it does
        not exist, the ``lfw-deepfunneled.tgz`` archive is downloaded and
        extracted into the current directory.
        NOTE(review): presumably the archive unpacks to ``lfw-deepfunneled``,
        so any other ``images_name`` would fail the existence check after
        download -- confirm.
    dx, dy : int
        Margins, in pixels, cropped from the left/right (``dx``) and
        top/bottom (``dy``) of every photo. ``0`` disables cropping along
        that axis.
    dimx, dimy : int
        Target size, passed to ``imresize`` as ``[dimx, dimy]``.
        NOTE(review): ``imresize`` documents a size tuple as (height, width),
        so ``dimx`` looks like the output height -- confirm before relying on
        non-square output.

    Returns
    -------
    all_photos : numpy.ndarray
        ``uint8`` array stacking one processed photo per attribute row.
    all_attrs : pandas.DataFrame
        Attribute columns only (``person``, ``imagenum`` and ``photo_path``
        are dropped); row ``i`` describes ``all_photos[i]``.

    Raises
    ------
    AssertionError
        If the image directory is still missing after the download attempt,
        or if some attribute rows have no matching photo on disk.
    """
    # Download and unpack the photos if the directory is not there yet.
    if not os.path.exists(images_name):
        print("images not found, donwloading...")
        os.system("wget http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz -O tmp.tgz")
        print("extracting...")
        os.system("tar xvzf tmp.tgz && rm tmp.tgz")
        print("done")
        # Explicit raise (not `assert`) so the check survives `python -O`.
        if not os.path.exists(images_name):
            raise AssertionError(
                "image directory %r not found after download" % (images_name,))

    if not os.path.exists(attrs_name):
        print("attributes not found, downloading...")
        os.system("wget http://www.cs.columbia.edu/CAVE/databases/pubfig/download/%s" % attrs_name)
        print("done")

    # Read the attributes from the path the caller asked for (previously the
    # default file name was hard-coded here and `attrs_name` was ignored).
    raw_attrs = pd.read_csv(attrs_name, sep='\t', skiprows=1)
    # The values are re-labelled with the header shifted left by one column,
    # dropping the first header name and the last data column (presumably the
    # header line starts with an extra marker field -- confirm against file).
    df_attrs = pd.DataFrame(raw_attrs.iloc[:, :-1].values,
                            columns=raw_attrs.columns[1:])

    # Index every photo on disk by (person, imagenum), parsed from file names
    # of the form "<First>_<Last>_<number>.jpg".
    photo_records = []
    for dirpath, _dirnames, filenames in os.walk(images_name):
        for fname in filenames:
            if not fname.endswith(".jpg"):
                continue
            tokens = fname[:-4].replace('_', ' ').split()
            photo_records.append({
                'person': ' '.join(tokens[:-1]),
                'imagenum': int(tokens[-1]),
                'photo_path': os.path.join(dirpath, fname),
            })

    # Explicit columns keep the merge keys present even when no photo exists,
    # so the failure below is the meaningful one rather than a KeyError.
    photo_ids = pd.DataFrame(photo_records,
                             columns=['person', 'imagenum', 'photo_path'])

    # Inner merge: photos now have the same order as the attribute rows.
    df = pd.merge(df_attrs, photo_ids, on=['person', 'imagenum'])

    if len(df) != len(df_attrs):
        raise AssertionError("lost some data when merging dataframes")

    def load_photo(path):
        """Read one photo, crop the margins and resize it."""
        img = imread(path)
        # Explicit end indices: `img[dy:-dy]` would be empty for dy == 0.
        img = img[dy:img.shape[0] - dy, dx:img.shape[1] - dx]
        return imresize(img, [dimx, dimy])

    all_photos = np.stack(
        [load_photo(path) for path in df['photo_path']]).astype('uint8')
    all_attrs = df.drop(["photo_path", "person", "imagenum"], axis=1)

    return all_photos, all_attrs
| 59 | + |
0 commit comments