Skip to content

Changes to export PostHistory and Comments data files to PostGres database #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.todo
*.7z
*.xml
*.pyc
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
- `python load_into_pg.py Tags` (not present in earliest dumps)
- `python load_into_pg.py Users`
- `python load_into_pg.py Votes`
- `python load_into_pg.py PostHistory`
- `python load_into_pg.py Comments`
- Finally, after all the initial tables have been created:
- `psql stackoverflow < ./sql/final_post.sql`
- If you used a different database name, make sure to use that instead of
Expand Down
30 changes: 24 additions & 6 deletions load_into_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
if pre != '':
cur.execute(pre)
conn.commit()
print 'Pre-processing took {} seconds'.format(time.time() - start_time)
print 'Pre-processing took {:.1f} seconds'.format(time.time() - start_time)

# Handle content of the table
start_time = time.time()
Expand All @@ -91,7 +91,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
' VALUES\n' + valuesStr + ';'
cur.execute(cmd)
conn.commit()
print 'Table processing took {} seconds'.format(time.time() - start_time)
print 'Table processing took {:.1f} seconds'.format(time.time() - start_time)

# Post-processing (creation of indexes)
start_time = time.time()
Expand Down Expand Up @@ -119,7 +119,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
parser = argparse.ArgumentParser()
parser.add_argument( 'table'
, help = 'The table to work on.'
, choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes']
, choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostHistory', 'Comments']
)

parser.add_argument( '-d', '--dbname'
Expand All @@ -134,12 +134,12 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas

parser.add_argument( '-u', '--username'
, help = 'Username for the database.'
, default = None
, default = 'postgres'
)

parser.add_argument( '-p', '--password'
, help = 'Password for the database.'
, default = None
, default = 'fibinse'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ouch.

)

parser.add_argument( '-P', '--port'
Expand Down Expand Up @@ -232,7 +232,25 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
, 'ExcerptPostId'
, 'WikiPostId'
]

elif table == 'PostHistory':
keys = [
'Id',
'PostHistoryTypeId',
'PostId',
'RevisionGUID',
'CreationDate',
'UserId',
'Text'
]
elif table == 'Comments':
keys = [
'Id',
'PostId',
'Score',
'Text',
'CreationDate',
'UserId',
]
choice = raw_input('This will drop the {} table. Are you sure [y/n]?'.format(table))

if len(choice) > 0 and choice[0].lower() == 'y':
Expand Down
11 changes: 11 additions & 0 deletions sql/Comments_post.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- hash index takes too long to create
CREATE INDEX cmnts_post_type_id_idx ON Comments USING btree (Score)
WITH (FILLFACTOR = 100);
CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId)
WITH (FILLFACTOR = 100);
CREATE INDEX cmnts_revguid_idx ON Comments USING btree (RevisionGUID)
WITH (FILLFACTOR = 100);
CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate)
WITH (FILLFACTOR = 100);
CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId)
WITH (FILLFACTOR = 100);
9 changes: 9 additions & 0 deletions sql/Comments_pre.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
DROP TABLE IF EXISTS Tags CASCADE;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a typo, it should be Comments instead. I'll fix it before merging it.

CREATE TABLE Comments (
Id int PRIMARY KEY ,
PostId int,
Score int,
Post_Text text,
CreationDate timestamp not NULL ,
UserId int
);
34 changes: 34 additions & 0 deletions sql/PostHistory_post.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- hash index takes too long to create
CREATE INDEX ph_post_type_id_idx ON PostHistory USING btree (PostHistoryTypeId)
WITH (FILLFACTOR = 100);
CREATE INDEX ph_postid_idx ON PostHistory USING hash (PostId)
WITH (FILLFACTOR = 100);
CREATE INDEX ph_revguid_idx ON PostHistory USING btree (RevisionGUID)
WITH (FILLFACTOR = 100);
CREATE INDEX ph_creation_date_idx ON PostHistory USING btree (CreationDate)
WITH (FILLFACTOR = 100);
CREATE INDEX ph_userid_idx ON PostHistory USING btree (UserId)
WITH (FILLFACTOR = 100);


CREATE TABLE accepted_answer_id as SELECT DISTINCT acceptedanswerid FROM posts;

DROP TABLE qn_ans_timing;

CREATE TABLE qn_ans_timing AS
SELECT
p.id,
p.tags,
p.owneruserid,
p.creationdate as qn_creation_ts,
h.creationdate as ans_creation_ts,
p.acceptedanswerid,
h.userid as answered_by
FROM posts p, posthistory h
WHERE p.acceptedanswerid is not null
AND h.postid in (
SELECT *
FROM accepted_answer_id
)

AND p.acceptedanswerid=h.postid;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels a bit too ad-hoc to be part of the default tables to be created. Perhaps this could be put in optional_post.sql?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you think it is still valuable, create a new PR with a short comment about its importance and change the names of the fields to conform with the other tables. It is unfortunate that the names are case insensitive, but it is better to be consistent now rather than making the user guess the field names per table.

10 changes: 10 additions & 0 deletions sql/PostHistory_pre.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
DROP TABLE IF EXISTS Tags CASCADE;
CREATE TABLE PostHistory (
Id int PRIMARY KEY ,
PostHistoryTypeId int,
PostId int,
RevisionGUID text,
CreationDate timestamp not NULL ,
UserId int,
PostText text
);