2025-05-18
“Sequel”, not “Ess-Queue-Ell”
pandas
treats tables as DataFrames
pandas
etc.Why should you use databases instead of CSVs?
sqlalchemy
sqlite
database using sqlalchemy
create_engine
.db
file called auctions.db
that contains data on bidding for 500 North Face clothing items on ShopGoodwill.com1sqlalchemy
engine:create_engine
Argumentsqlalchemy
what sort of database we’re connecting to/
characters before the database location.['bids', 'items']
So we have two tables, named “bids” and “items”
sqlalchemy
or pyspark
have methods to take the place of queryingTo assess the output of our queries, we’re going to write a class that will run our query against the database and return a DataFrame
as the table output.
import pandas as pd
from sqlalchemy.orm import Session
class DataBase:
def __init__(self, loc: str, db_type: str = "sqlite") -> None:
"""Initialize the class and connect to the database"""
self.loc = loc
self.db_type = db_type
self.engine = create_engine(f'{self.db_type}:///{self.loc}')
def query(self, q: str) -> pd.DataFrame:
"""Run a query against the database and return a DataFrame"""
with Session(self.engine) as session:
df = pd.read_sql(q, session.bind)
return(df)
auctions = DataBase(path)
SELECT *
Statement1 index bidLogId itemId itemPrice bidAmount \
0 50 0 178348858 9.99 20.0
1 51 0 178348858 13.00 12.0
2 52 0 178348858 21.00 23.0
3 53 0 178348858 24.00 35.0
4 54 0 178348858 36.00 48.0
bidTime quantity bidIPAddress adCode serverIP \
0 2023-09-18 16:11:04.587000 1 None None None
1 2023-09-22 14:22:06.700000 1 None None None
2 2023-09-23 12:35:18.157000 1 None None None
3 2023-09-23 18:23:27.993000 1 None None None
4 2023-09-23 18:37:47.213000 1 None None None
retracted bidderName highBidderName isBuyerHighBidder isLoggedInBuyer
0 0 a****9 a****9 0 0
1 0 S****p a****9 0 0
2 0 H****2 H****2 0 0
3 0 b****e b****e 0 0
4 0 t****5 t****5 0 0
SELECT
Columns Statement itemId description isBuyNowUsed
0 179353985 <p><strong>Description:</strong></p>\n<p>Women... 0
1 177087535 <p><strong>Details & Condition</strong></p... 0
2 180876361 <p>The North Face Womens Pink Long Sleeve Mock... 0
3 177763109 <p><br></p><ul><li><span class="ql-size-large"... 0
4 179660197 <p><b>Title: </b>The North Face Mens Red Flat ... 0
JOIN
StatementsRecall our discussion on joining in pandas
– these are SQL-style joins, and SQL has the same types.
itemId description bidderName \
0 179353985 <p><strong>Description:</strong></p>\n<p>Women... None
1 177087535 <p><strong>Details & Condition</strong></p... None
2 180876361 <p>The North Face Womens Pink Long Sleeve Mock... B****4
3 177763109 <p><br></p><ul><li><span class="ql-size-large"... n****m
4 177763109 <p><br></p><ul><li><span class="ql-size-large"... N****0
bidAmount bidTime
0 NaN None
1 NaN None
2 19.99 2023-10-18 05:54:55.327000
3 10.00 2023-09-17 11:52:27.447000
4 14.00 2023-09-17 17:33:48.517000
JOIN
AliasesShould this run?
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: ambiguous column name: itemid The above exception was the direct cause of the following exception: OperationalError Traceback (most recent call last) Cell In[7], line 7 1 q = """ 2 select itemid, description, biddername, bidamount, bidtime 3 from items 4 left join bids 5 on items.itemid = bids.itemid 6 """ ----> 7 print(auctions.query(q).head()) Cell In[3], line 13, in DataBase.query(self, q) 11 """Run a query against the database and return a DataFrame""" 12 with Session(self.engine) as session: ---> 13 df = pd.read_sql(q, session.bind) 14 return(df) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:734, in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize, dtype_backend, dtype) 724 return pandas_sql.read_table( 725 sql, 726 index_col=index_col, (...) 731 dtype_backend=dtype_backend, 732 ) 733 else: --> 734 return pandas_sql.read_query( 735 sql, 736 index_col=index_col, 737 params=params, 738 coerce_float=coerce_float, 739 parse_dates=parse_dates, 740 chunksize=chunksize, 741 dtype_backend=dtype_backend, 742 dtype=dtype, 743 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1836, in SQLDatabase.read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype, dtype_backend) 1779 def read_query( 1780 self, 1781 sql: str, (...) 1788 dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", 1789 ) -> DataFrame | Iterator[DataFrame]: 1790 """ 1791 Read SQL query into a DataFrame. 1792 (...) 1834 1835 """ -> 1836 result = self.execute(sql, params) 1837 columns = result.keys() 1839 if chunksize is not None: File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1659, in SQLDatabase.execute(self, sql, params) 1657 args = [] if params is None else [params] 1658 if isinstance(sql, str): -> 1659 return self.con.exec_driver_sql(sql, *args) 1660 return self.con.execute(sql, *args) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1775, in Connection.exec_driver_sql(self, statement, parameters, execution_options) 1770 execution_options = self._execution_options.merge_with( 1771 execution_options 1772 ) 1774 dialect = self.dialect -> 1775 ret = self._execute_context( 1776 dialect, 1777 dialect.execution_ctx_cls._init_statement, 1778 statement, 1779 None, 1780 execution_options, 1781 statement, 1782 distilled_parameters, 1783 ) 1785 return ret File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1842, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw) 1840 return self._exec_insertmany_context(dialect, context) 1841 else: -> 1842 return self._exec_single_context( 1843 dialect, context, statement, parameters 1844 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1982, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1979 result = context._setup_result_proxy() 1981 except BaseException as e: -> 1982 self._handle_dbapi_exception( 1983 e, str_statement, effective_parameters, cursor, context 1984 ) 1986 return result File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:2351, in Connection._handle_dbapi_exception(self, e, statement, parameters, cursor, context, is_sub_exec) 2349 elif should_wrap: 2350 assert sqlalchemy_exception is not None -> 2351 raise sqlalchemy_exception.with_traceback(exc_info[2]) from e 2352 else: 2353 assert exc_info[1] is not None File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1961 break 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: 1968 self.dispatch.after_cursor_execute( 1969 self, 1970 cursor, (...) 1974 context.executemany, 1975 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: (sqlite3.OperationalError) ambiguous column name: itemid [SQL: select itemid, description, biddername, bidamount, bidtime from items left join bids on items.itemid = bids.itemid ] (Background on this error at: https://sqlalche.me/e/20/e3q8)
JOIN
Renaming TablesIt’s often convenient to rename tables in joins to make your query less verbose (potentially at the cost of readability)
itemId description bidderName \
0 179353985 <p><strong>Description:</strong></p>\n<p>Women... None
1 177087535 <p><strong>Details & Condition</strong></p... None
2 180876361 <p>The North Face Womens Pink Long Sleeve Mock... B****4
3 177763109 <p><br></p><ul><li><span class="ql-size-large"... n****m
4 177763109 <p><br></p><ul><li><span class="ql-size-large"... N****0
bidAmount bidTime
0 NaN None
1 NaN None
2 19.99 2023-10-18 05:54:55.327000
3 10.00 2023-09-17 11:52:27.447000
4 14.00 2023-09-17 17:33:48.517000
For each of the join types supported in sqlite
(left, inner, cross), perform the join on the two tables and report the number of observations in the resulting join.
[551, 879, 551]
WHERE
itemId description bidderName \
0 178348858 <p><br></p><ul><li><span class="ql-size-large"... a****9
1 178348858 <p><br></p><ul><li><span class="ql-size-large"... S****p
2 178348858 <p><br></p><ul><li><span class="ql-size-large"... H****2
3 178348858 <p><br></p><ul><li><span class="ql-size-large"... b****e
4 178348858 <p><br></p><ul><li><span class="ql-size-large"... t****5
bidAmount bidTime
0 20.0 2023-09-18 16:11:04.587000
1 12.0 2023-09-22 14:22:06.700000
2 23.0 2023-09-23 12:35:18.157000
3 35.0 2023-09-23 18:23:27.993000
4 48.0 2023-09-23 18:37:47.213000
WHERE
With Multiple Conditions itemId description bidderName \
0 180876361 <p>The North Face Womens Pink Long Sleeve Mock... B****4
1 177763109 <p><br></p><ul><li><span class="ql-size-large"... n****m
2 177763109 <p><br></p><ul><li><span class="ql-size-large"... N****0
3 177763109 <p><br></p><ul><li><span class="ql-size-large"... M****y
4 177763109 <p><br></p><ul><li><span class="ql-size-large"... M****y
bidAmount bidTime
0 19.99 2023-10-18 05:54:55.327000
1 10.00 2023-09-17 11:52:27.447000
2 14.00 2023-09-17 17:33:48.517000
3 15.00 2023-09-17 18:27:00.087000
4 15.00 2023-09-17 18:33:37.233000
GROUP BY
The same as .groupby()
in pandas
– add aggregating functions to the SELECT
clause
itemId n_bidders
0 165561698 1
1 170983900 1
2 172998011 2
3 173907435 1
4 174445924 3
COUNT
We can also just count observations without a grouping:
count(*)
0 500
MIN
and MAX
In SQL, MIN
and MAX
are aggregating functions that work the same way as COUNT
. Use them to create a table of the number of bids each bidder submitted for each item, as well as their largest and smallest bid.
MIN
and MAX
itemId bidderName n_bids min_bid max_bid
0 165561698 n****4 1 9.91 9.91
1 170983900 c****3 1 9.91 9.91
2 172998011 A****e 1 9.91 9.91
3 172998011 J****m 1 9.91 9.91
4 173907435 M****n 1 14.99 14.99
What if we only care about bid distribution for a bidder when their largest bid is more than $20?
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: near "where": syntax error The above exception was the direct cause of the following exception: OperationalError Traceback (most recent call last) Cell In[16], line 8 1 q = """ 2 select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid, 3 max(bidamount) as max_bid (...) 6 where max_bid > 20 7 """ ----> 8 print(auctions.query(q).head()) Cell In[3], line 13, in DataBase.query(self, q) 11 """Run a query against the database and return a DataFrame""" 12 with Session(self.engine) as session: ---> 13 df = pd.read_sql(q, session.bind) 14 return(df) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:734, in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize, dtype_backend, dtype) 724 return pandas_sql.read_table( 725 sql, 726 index_col=index_col, (...) 731 dtype_backend=dtype_backend, 732 ) 733 else: --> 734 return pandas_sql.read_query( 735 sql, 736 index_col=index_col, 737 params=params, 738 coerce_float=coerce_float, 739 parse_dates=parse_dates, 740 chunksize=chunksize, 741 dtype_backend=dtype_backend, 742 dtype=dtype, 743 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1836, in SQLDatabase.read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype, dtype_backend) 1779 def read_query( 1780 self, 1781 sql: str, (...) 1788 dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", 1789 ) -> DataFrame | Iterator[DataFrame]: 1790 """ 1791 Read SQL query into a DataFrame. 1792 (...) 1834 1835 """ -> 1836 result = self.execute(sql, params) 1837 columns = result.keys() 1839 if chunksize is not None: File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1659, in SQLDatabase.execute(self, sql, params) 1657 args = [] if params is None else [params] 1658 if isinstance(sql, str): -> 1659 return self.con.exec_driver_sql(sql, *args) 1660 return self.con.execute(sql, *args) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1775, in Connection.exec_driver_sql(self, statement, parameters, execution_options) 1770 execution_options = self._execution_options.merge_with( 1771 execution_options 1772 ) 1774 dialect = self.dialect -> 1775 ret = self._execute_context( 1776 dialect, 1777 dialect.execution_ctx_cls._init_statement, 1778 statement, 1779 None, 1780 execution_options, 1781 statement, 1782 distilled_parameters, 1783 ) 1785 return ret File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1842, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw) 1840 return self._exec_insertmany_context(dialect, context) 1841 else: -> 1842 return self._exec_single_context( 1843 dialect, context, statement, parameters 1844 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1982, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1979 result = context._setup_result_proxy() 1981 except BaseException as e: -> 1982 self._handle_dbapi_exception( 1983 e, str_statement, effective_parameters, cursor, context 1984 ) 1986 return result File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:2351, in Connection._handle_dbapi_exception(self, e, statement, parameters, cursor, context, is_sub_exec) 2349 elif should_wrap: 2350 assert sqlalchemy_exception is not None -> 2351 raise sqlalchemy_exception.with_traceback(exc_info[2]) from e 2352 else: 2353 assert exc_info[1] is not None File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1961 break 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: 1968 self.dispatch.after_cursor_execute( 1969 self, 1970 cursor, (...) 1974 context.executemany, 1975 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: (sqlite3.OperationalError) near "where": syntax error [SQL: select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid, max(bidamount) as max_bid from bids group by itemid, biddername where max_bid > 20 ] (Background on this error at: https://sqlalche.me/e/20/e3q8)
HAVING
If we want to filter on the aggregate function value, we need to use HAVING
instead of WHERE
itemId bidderName n_bids min_bid max_bid
0 174767945 C****2 3 24.44 34.00
1 174767945 b****z 4 25.00 33.00
2 174871788 J****3 1 21.00 21.00
3 174871788 v****l 3 15.00 22.00
4 174901466 c****8 1 39.99 39.99
ORDER BY
Sorting works in an intuitive way
itemId bidderName n_bids min_bid max_bid
0 180573534 j****a 1 301.0 301.0
1 180573534 A****3 4 140.0 300.0
2 180601736 c****c 4 180.0 201.0
3 180601736 A****8 2 150.0 200.0
4 180601736 B****a 1 160.0 160.0
LIMIT
We’ve been asking for the head of our DataFrame
to limit output – we can do this directly in the query:
itemId bidderName n_bids min_bid max_bid
0 180573534 j****a 1 301.0 301.0
In our sample, how many bidders participate in multiple auctions? And how many auctions do they participate in?
60
We’ll see soon that we could also do this with a “subquery”
count(*)
0 60
OVER
If we want to compute operations by group and assign it as a new variable, we need to tell SQL how to organize the groups:
itemId min_bid itemPrice
0 165561698 9.91 9.91
1 170983900 9.91 9.91
2 172998011 9.91 9.91
3 172998011 9.91 9.91
4 173907435 14.99 14.99
LAG
Window functions are particularly useful if we need to lag data in SQL
itemId min_bid itemPrice lagged_price
0 165561698 9.91 9.91 NaN
1 170983900 9.91 9.91 NaN
2 172998011 9.91 9.91 NaN
3 172998011 9.91 9.91 9.91
4 173907435 14.99 14.99 NaN
String concatenation in SQL is performed with ||
title itemId \
0 Womens Size M The North Face Zip Up Jacket 179353985
1 The North Face Women's Size 4 Tan/Khaki Lightw... 177087535
2 The North Face Womens Pink Long Sleeve Mock Ne... 180876361
3 The North Face Women's Medium Sweaters/Shirt L... 177763109
4 The North Face Mens Red Flat Front Slash Pocke... 179660197
full_description
0 Womens Size M The North Face Zip Up Jacket <p>...
1 The North Face Women's Size 4 Tan/Khaki Lightw...
2 The North Face Womens Pink Long Sleeve Mock Ne...
3 The North Face Women's Medium Sweaters/Shirt L...
4 The North Face Mens Red Flat Front Slash Pocke...
itemId currentPrice shipping final_price
0 179353985 10.99 0 10.99
1 177087535 24.98 0 24.98
2 180876361 19.99 0 19.99
3 177763109 15.00 0 15.00
4 179660197 12.99 0 12.99
CASE WHEN
SQL’s if-else statement (similar to R
’s ifelse
or case_when
verbs)
itemId currentPrice shipping final_price
0 176705357 19.99 2 21.99
1 179025543 14.99 2 16.99
2 179353985 10.99 0 15.99
3 177087535 24.98 0 29.98
4 180876361 19.99 0 24.99
We can use LIKE
to pattern match – %
means zero, one, or multiple characters (this is a bad application – why?)
itemId currentPrice size
0 177087535 24.98 small
1 180876361 19.99 small
2 177763109 15.00 large
3 179660197 12.99 small
4 176601978 9.99 large
execute
method that facilitates this for our engine
from sqlalchemy import text
class DataBase:
def __init__(self, loc: str, db_type: str = "sqlite") -> None:
"""Initialize the class and connect to the database"""
self.loc = loc
self.db_type = db_type
self.engine = create_engine(f'{self.db_type}:///{self.loc}')
def query(self, q: str) -> pd.DataFrame:
"""Run a query against the database and return a DataFrame"""
with Session(self.engine) as session:
df = pd.read_sql(q, session.bind)
return(df)
def execute(self, q: str) -> None:
"""Execute statement on the database"""
with self.engine.connect() as conn:
conn.execute(text(q))
auctions = DataBase(path)
If we want to create a new table that contains only observations with bids where the buy now option wasn’t used, we can execute
a statement to do so.
index buyerCountry buyerCountryCode buyerState buyerStreet buyerZip \
0 12100 None US None None None
categoryParentList defaultShippingResults \
0 10|Clothing|27|Women's Clothing|154|Outerwear None
description \
0 <p>The North Face Womens Pink Long Sleeve Mock...
imageServer ... \
0 https://shopgoodwillimages.azureedge.net/produ... ...
bidTime quantity:1 bidIPAddress adCode serverIP \
0 2023-10-18 05:54:55.327000 1 None None None
retracted bidderName highBidderName isBuyerHighBidder isLoggedInBuyer
0 0 B****4 B****4 0 0
[1 rows x 100 columns]
Why do we need the first statement? Because SQL won’t let us create a table that already has a given name
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: table full_data already exists The above exception was the direct cause of the following exception: OperationalError Traceback (most recent call last) Cell In[31], line 5 1 q = """ 2 create table full_data as 3 select * from items 4 """ ----> 5 auctions.execute(q) Cell In[29], line 17, in DataBase.execute(self, q) 15 """Execute statement on the database""" 16 with self.engine.connect() as conn: ---> 17 conn.execute(text(q)) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1415, in Connection.execute(self, statement, parameters, execution_options) 1413 raise exc.ObjectNotExecutableError(statement) from err 1414 else: -> 1415 return meth( 1416 self, 1417 distilled_parameters, 1418 execution_options or NO_OPTIONS, 1419 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/sql/elements.py:523, in ClauseElement._execute_on_connection(self, connection, distilled_params, execution_options) 521 if TYPE_CHECKING: 522 assert isinstance(self, Executable) --> 523 return connection._execute_clauseelement( 524 self, distilled_params, execution_options 525 ) 526 else: 527 raise exc.ObjectNotExecutableError(self) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1637, in Connection._execute_clauseelement(self, elem, distilled_parameters, execution_options) 1625 compiled_cache: Optional[CompiledCacheType] = execution_options.get( 1626 "compiled_cache", self.engine._compiled_cache 1627 ) 1629 compiled_sql, extracted_params, cache_hit = elem._compile_w_cache( 1630 dialect=dialect, 1631 compiled_cache=compiled_cache, (...) 1635 linting=self.dialect.compiler_linting | compiler.WARN_LINTING, 1636 ) -> 1637 ret = self._execute_context( 1638 dialect, 1639 dialect.execution_ctx_cls._init_compiled, 1640 compiled_sql, 1641 distilled_parameters, 1642 execution_options, 1643 compiled_sql, 1644 distilled_parameters, 1645 elem, 1646 extracted_params, 1647 cache_hit=cache_hit, 1648 ) 1649 if has_events: 1650 self.dispatch.after_execute( 1651 self, 1652 elem, (...) 1656 ret, 1657 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1842, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw) 1840 return self._exec_insertmany_context(dialect, context) 1841 else: -> 1842 return self._exec_single_context( 1843 dialect, context, statement, parameters 1844 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1982, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1979 result = context._setup_result_proxy() 1981 except BaseException as e: -> 1982 self._handle_dbapi_exception( 1983 e, str_statement, effective_parameters, cursor, context 1984 ) 1986 return result File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:2351, in Connection._handle_dbapi_exception(self, e, statement, parameters, cursor, context, is_sub_exec) 2349 elif should_wrap: 2350 assert sqlalchemy_exception is not None -> 2351 raise sqlalchemy_exception.with_traceback(exc_info[2]) from e 2352 else: 2353 assert exc_info[1] is not None File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1961 break 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: 1968 self.dispatch.after_cursor_execute( 1969 self, 1970 cursor, (...) 1974 context.executemany, 1975 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: (sqlite3.OperationalError) table full_data already exists [SQL: create table full_data as select * from items ] (Background on this error at: https://sqlalche.me/e/20/e3q8)
index buyerCountry buyerCountryCode buyerState buyerStreet buyerZip \
0 12100 None US None None None
categoryParentList defaultShippingResults \
0 10|Clothing|27|Women's Clothing|154|Outerwear None
description \
0 <p>The North Face Womens Pink Long Sleeve Mock...
imageServer ... \
0 https://shopgoodwillimages.azureedge.net/produ... ...
bidTime quantity:1 bidIPAddress adCode serverIP \
0 2023-10-18 05:54:55.327000 1 None None None
retracted bidderName highBidderName isBuyerHighBidder isLoggedInBuyer
0 0 B****4 B****4 0 0
[1 rows x 100 columns]
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: no such table: full_data The above exception was the direct cause of the following exception: OperationalError Traceback (most recent call last) Cell In[33], line 2 1 auctions = DataBase(path) ----> 2 print(auctions.query("select * from full_data limit 1")) Cell In[29], line 12, in DataBase.query(self, q) 10 """Run a query against the database and return a DataFrame""" 11 with Session(self.engine) as session: ---> 12 df = pd.read_sql(q, session.bind) 13 return(df) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:734, in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize, dtype_backend, dtype) 724 return pandas_sql.read_table( 725 sql, 726 index_col=index_col, (...) 731 dtype_backend=dtype_backend, 732 ) 733 else: --> 734 return pandas_sql.read_query( 735 sql, 736 index_col=index_col, 737 params=params, 738 coerce_float=coerce_float, 739 parse_dates=parse_dates, 740 chunksize=chunksize, 741 dtype_backend=dtype_backend, 742 dtype=dtype, 743 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1836, in SQLDatabase.read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype, dtype_backend) 1779 def read_query( 1780 self, 1781 sql: str, (...) 1788 dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", 1789 ) -> DataFrame | Iterator[DataFrame]: 1790 """ 1791 Read SQL query into a DataFrame. 1792 (...) 1834 1835 """ -> 1836 result = self.execute(sql, params) 1837 columns = result.keys() 1839 if chunksize is not None: File ~/anaconda3/envs/website/lib/python3.12/site-packages/pandas/io/sql.py:1659, in SQLDatabase.execute(self, sql, params) 1657 args = [] if params is None else [params] 1658 if isinstance(sql, str): -> 1659 return self.con.exec_driver_sql(sql, *args) 1660 return self.con.execute(sql, *args) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1775, in Connection.exec_driver_sql(self, statement, parameters, execution_options) 1770 execution_options = self._execution_options.merge_with( 1771 execution_options 1772 ) 1774 dialect = self.dialect -> 1775 ret = self._execute_context( 1776 dialect, 1777 dialect.execution_ctx_cls._init_statement, 1778 statement, 1779 None, 1780 execution_options, 1781 statement, 1782 distilled_parameters, 1783 ) 1785 return ret File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1842, in Connection._execute_context(self, dialect, constructor, statement, parameters, execution_options, *args, **kw) 1840 return self._exec_insertmany_context(dialect, context) 1841 else: -> 1842 return self._exec_single_context( 1843 dialect, context, statement, parameters 1844 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1982, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1979 result = context._setup_result_proxy() 1981 except BaseException as e: -> 1982 self._handle_dbapi_exception( 1983 e, str_statement, effective_parameters, cursor, context 1984 ) 1986 return result File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:2351, in Connection._handle_dbapi_exception(self, e, statement, parameters, cursor, context, is_sub_exec) 2349 elif should_wrap: 2350 assert sqlalchemy_exception is not None -> 2351 raise sqlalchemy_exception.with_traceback(exc_info[2]) from e 2352 else: 2353 assert exc_info[1] is not None File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/base.py:1963, in Connection._exec_single_context(self, dialect, context, statement, parameters) 1961 break 1962 if not evt_handled: -> 1963 self.dialect.do_execute( 1964 cursor, str_statement, effective_parameters, context 1965 ) 1967 if self._has_events or self.engine._has_events: 1968 self.dispatch.after_cursor_execute( 1969 self, 1970 cursor, (...) 1974 context.executemany, 1975 ) File ~/anaconda3/envs/website/lib/python3.12/site-packages/sqlalchemy/engine/default.py:943, in DefaultDialect.do_execute(self, cursor, statement, parameters, context) 942 def do_execute(self, cursor, statement, parameters, context=None): --> 943 cursor.execute(statement, parameters) OperationalError: (sqlite3.OperationalError) no such table: full_data [SQL: select * from full_data limit 1] (Background on this error at: https://sqlalche.me/e/20/e3q8)
For each bid, express its time as relative to when the auction ended (endtime
). That means that if an auction was 10 hours long (as measured by endtime
- starttime
) and a bid was placed an hour before the auction ended, it would have a normalized timestamp of .1. Plot this distribution as a histogram.
Hint: to compute the difference in time between two dates, use julianday(time1)-julianday(time2)
.
itemId startTime endTime length
0 179353985 2023-09-28 17:00:54.000000 2023-10-02 18:14:00.000000 4.050764
1 177087535 2023-09-04 22:54:00.000000 2023-09-12 19:46:00.000000 7.869444
2 180876361 2023-10-14 03:18:40.000000 2023-10-19 04:04:40.000000 5.031944
3 177763109 2023-09-12 08:22:45.000000 2023-09-17 18:34:00.000000 5.424479
itemId bidTime startTime \
0 178348858 2023-09-18 16:11:04.587000 2023-09-18 14:29:56.000000
1 178348858 2023-09-22 14:22:06.700000 2023-09-18 14:29:56.000000
2 178348858 2023-09-23 12:35:18.157000 2023-09-18 14:29:56.000000
3 178348858 2023-09-23 18:23:27.993000 2023-09-18 14:29:56.000000
4 178348858 2023-09-23 18:37:47.213000 2023-09-18 14:29:56.000000
endTime time_norm
0 2023-09-23 18:39:00.000000 0.986422
1 2023-09-23 18:39:00.000000 0.227799
2 2023-09-23 18:39:00.000000 0.048825
3 2023-09-23 18:39:00.000000 0.002085
4 2023-09-23 18:39:00.000000 0.000163
What does this tell us? Are these results surprising?
q = """
select b.itemid, b.bidtime, a.starttime, a.endtime,
(julianday(endtime)-julianday(bidtime)) / a.length as time_norm
from bids as b
inner join (
select itemid, starttime, endtime,
julianday(endtime) - julianday(starttime) as length
from items
) as a
on b.itemid=a.itemid
"""
df = auctions.query(q)
print(df.head(2))
itemId bidTime starttime \
0 178348858 2023-09-18 16:11:04.587000 2023-09-18 14:29:56.000000
1 178348858 2023-09-22 14:22:06.700000 2023-09-18 14:29:56.000000
endtime time_norm
0 2023-09-23 18:39:00.000000 0.986422
1 2023-09-23 18:39:00.000000 0.227799
Using WITH
improves readability
q = """
with a as (
select itemid, starttime, endtime,
julianday(endtime) - julianday(starttime) as length
from items
)
select b.itemid, b.bidtime, a.starttime, a.endtime,
(julianday(endtime)-julianday(bidtime)) / a.length as time_norm
from bids as b
inner join a
on b.itemid=a.itemid
"""
df = auctions.query(q)
print(df.head(2))
itemId bidTime starttime \
0 178348858 2023-09-18 16:11:04.587000 2023-09-18 14:29:56.000000
1 178348858 2023-09-22 14:22:06.700000 2023-09-18 14:29:56.000000
endtime time_norm
0 2023-09-23 18:39:00.000000 0.986422
1 2023-09-23 18:39:00.000000 0.227799
No unified linting tools such as pylint
for python
Always assume that the code you write today will be inherited by a murderous psychopath who knows where you live!
not
not
Why put the comma first?
Align column names with manual spaces
If nesting subqueries, use consistent indentation
Additional suggestions:
sqlite
DatabaseIf you have CSV files, you can create a database like this:
create_db.py
engine = create_engine("sqlite:////Users/hlukas/git/personal_website/static/econ-481/data/auctions.db")
bids = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/bidding_data.csv')
items = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/item_data.csv')
items_small = items.sample(500)
bids_small = bids.loc[bids['itemId'].isin(items_small['itemId'])]
bids_small.to_sql(con=engine, name='bids', if_exists='replace')
items_small.to_sql(con=engine, name='items', if_exists='replace')
update_db.py
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine(f'sqlite:///{path}')
Base = declarative_base()
Base.metadata.create_all(engine)
items = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/item_data.csv')
items_small = items.sample(500)
items_small.to_sql(con=engine, name='items', if_exists='append')
sqlalchemy
We don’t really need to write SQL if we don’t want to to use the package:
from sqlalchemy import MetaData, Table, select
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine(f'sqlite:///{path}')
Base = declarative_base()
Base.metadata.reflect(engine)
bids = Base.metadata.tables['bids']
query = select(bids.c.itemId, bids.c.bidAmount)\
.where(bids.c.bidAmount==10)\
.limit(5)
with Session(engine) as s:
print(pd.DataFrame(s.execute(query)))
itemId bidAmount
0 177106026 10.0
1 177963226 10.0
2 178438915 10.0
3 181622497 10.0
4 179719241 10.0
itemId n_bidders
0 165561698 1
1 170983900 1
2 172998011 2
3 173907435 1
4 174445924 3
.. ... ...
167 182760698 1
168 182777527 1
169 182883873 2
170 182894197 1
171 182925212 2
[172 rows x 2 columns]
[1,2,3,4,5]
\[ 5x \geq x + t \iff t \leq 5 \]