Skip to content

Commit 5b10e89

Browse files
author
Your Name
committed
add dataset downloader
1 parent adcf196 commit 5b10e89

File tree

2 files changed

+89
-30
lines changed

2 files changed

+89
-30
lines changed

week12/VAE_homework.ipynb

Lines changed: 30 additions & 30 deletions
Large diffs are not rendered by default.

week12/lfw_dataset.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import numpy as np
2+
import os
3+
from scipy.misc import imread,imresize
4+
import pandas as pd
5+
6+
def fetch_lfw_dataset(attrs_name = "lfw_attributes.txt",
                      images_name = "lfw-deepfunneled",
                      dx=80,dy=80,
                      dimx=45,dimy=45
                      ):
    """Load LFW face photos and their attribute table, downloading if absent.

    If ``images_name`` does not exist, the deep-funneled LFW archive is
    fetched with ``wget`` and unpacked with ``tar`` into the current
    directory.  If ``attrs_name`` does not exist, it is fetched with ``wget``.
    Every ``.jpg`` found under ``images_name`` is then matched to a row of the
    attribute table by ``(person, imagenum)``, cropped, resized, and stacked.

    Args:
        attrs_name: Path of the tab-separated attributes file to read (and
            the file name requested from the download URL when missing).
        images_name: Directory that is walked recursively for ``.jpg`` files.
        dx: Number of pixel columns cropped from the left and right edges.
        dy: Number of pixel rows cropped from the top and bottom edges.
        dimx: First element of the size passed to ``imresize``.
        dimy: Second element of the size passed to ``imresize``.
            NOTE(review): ``scipy.misc.imresize`` documents a tuple size as
            (height, width), so ``dimx`` is presumably the output height --
            confirm before relying on the names.

    Returns:
        A tuple ``(all_photos, all_attrs)``: ``all_photos`` is a ``uint8``
        array stacking one preprocessed image per merged row, and
        ``all_attrs`` is the merged table without the ``photo_path``,
        ``person`` and ``imagenum`` columns.  Rows of both are in the same
        order.

    Raises:
        subprocess.CalledProcessError: If ``wget`` or ``tar`` exits non-zero.
        OSError: If ``wget`` or ``tar`` cannot be started.
        AssertionError: If ``images_name`` is still missing after the
            download, or if the merge does not yield exactly one row per
            attribute row.

    NOTE(review): ``imread`` / ``imresize`` are taken from the file-level
    ``scipy.misc`` import; those functions were removed from newer SciPy
    releases, so this function needs an old SciPy (or a replacement import).
    """
    # Imported here so the file's top-level import block stays untouched.
    import subprocess

    # download if not exists
    if not os.path.exists(images_name):
        print("images not found, donwloading...")
        archive = "tmp.tgz"
        try:
            # Argument lists (no shell) and checked exit codes: a failed
            # download or extraction surfaces immediately.
            subprocess.check_call(
                ["wget",
                 "http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz",
                 "-O", archive])
            print("extracting...")
            subprocess.check_call(["tar", "xvzf", archive])
        finally:
            # Never leave a partial archive behind, even on failure.
            if os.path.exists(archive):
                os.remove(archive)
        print("done")
        # Explicit raise (not ``assert``) so the check survives ``python -O``;
        # AssertionError is kept for backward compatibility.
        if not os.path.exists(images_name):
            raise AssertionError(
                "%r not found after downloading the archive" % (images_name,))

    if not os.path.exists(attrs_name):
        print("attributes not found, downloading...")
        subprocess.check_call(
            ["wget",
             "http://www.cs.columbia.edu/CAVE/databases/pubfig/download/%s"
             % attrs_name])
        print("done")

    # read attrs -- honour ``attrs_name`` instead of a hard-coded file name
    df_attrs = pd.read_csv(attrs_name, sep='\t', skiprows=1)
    # Re-label the columns: the values of all but the last parsed column are
    # paired with the header names shifted left by one.  NOTE(review):
    # presumably the header row starts with an extra marker token -- confirm
    # against the attributes file.
    df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values,
                            columns=df_attrs.columns[1:])

    # read photos
    photo_records = []
    for dirpath, dirnames, filenames in os.walk(images_name):
        for fname in filenames:
            if not fname.endswith(".jpg"):
                continue
            # "<Some>_<Person>_<number>.jpg" -> ("Some Person", number)
            tokens = fname[:-4].replace('_', ' ').split()
            photo_records.append({
                'person': ' '.join(tokens[:-1]),
                'imagenum': int(tokens[-1]),
                'photo_path': os.path.join(dirpath, fname),
            })

    # Explicit columns keep the merge keys present even when no photo was
    # found, so that case reaches the row-count check below instead of
    # failing inside the merge with a KeyError.
    photo_ids = pd.DataFrame(photo_records,
                             columns=['person', 'imagenum', 'photo_path'])

    # mass-merge
    # (photos now have same order as attributes)
    df = pd.merge(df_attrs, photo_ids, on=['person', 'imagenum'])

    if len(df) != len(df_attrs):
        raise AssertionError("lost some data when merging dataframes")

    # image preprocessing
    def _preprocess(path):
        img = imread(path)
        rows, cols = img.shape[0], img.shape[1]
        # Explicit end indices: ``img[dy:-dy]`` would be empty for dy == 0.
        img = img[dy:rows - dy, dx:cols - dx]
        return imresize(img, [dimx, dimy])

    all_photos = np.stack(
        [_preprocess(path) for path in df['photo_path']]).astype('uint8')
    all_attrs = df.drop(["photo_path", "person", "imagenum"], axis=1)

    return all_photos, all_attrs
59+

0 commit comments

Comments
 (0)