2025-07-18
“Sequel”, not “Ess-Queue-Ell”
pandas (and similar libraries) treats tables as DataFrames. Why should you use databases instead of CSVs?
We connect to a sqlite database using sqlalchemy's create_engine. Our database is a .db file called auctions.db that contains data on bidding for 500 North Face clothing items on ShopGoodwill.com. The argument to create_engine tells sqlalchemy what sort of database we're connecting to, with /// characters before the database location. Listing the tables in the database gives:
['bids', 'items']
So we have two tables, named “bids” and “items”
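One way to list the table names is with sqlalchemy's inspect (a sketch; the exact call used above isn't shown):

from sqlalchemy import create_engine, inspect

engine = create_engine(f"sqlite:///{path}")  # path points at auctions.db
print(inspect(engine).get_table_names())     # ['bids', 'items']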
Libraries like sqlalchemy or pyspark have methods that can take the place of writing raw queries. To assess the output of our queries, we're going to write a class that runs a query against the database and returns the result as a DataFrame.
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import Session


class DataBase:
    def __init__(self, loc: str, db_type: str = "sqlite") -> None:
        """Initialize the class and connect to the database"""
        self.loc = loc
        self.db_type = db_type
        self.engine = create_engine(f'{self.db_type}:///{self.loc}')

    def query(self, q: str) -> pd.DataFrame:
        """Run a query against the database and return a DataFrame"""
        with Session(self.engine) as session:
            df = pd.read_sql(q, session.bind)
        return df
auctions = DataBase(path)

SELECT * Statement

   index  bidLogId     itemId  itemPrice  bidAmount  \
0     50         0  178348858       9.99       20.0   
1     51         0  178348858      13.00       12.0   
2     52         0  178348858      21.00       23.0   
3     53         0  178348858      24.00       35.0   
4     54         0  178348858      36.00       48.0   
                      bidTime  quantity bidIPAddress adCode serverIP  \
0  2023-09-18 16:11:04.587000         1         None   None     None   
1  2023-09-22 14:22:06.700000         1         None   None     None   
2  2023-09-23 12:35:18.157000         1         None   None     None   
3  2023-09-23 18:23:27.993000         1         None   None     None   
4  2023-09-23 18:37:47.213000         1         None   None     None   
   retracted bidderName highBidderName  isBuyerHighBidder  isLoggedInBuyer  
0          0     a****9         a****9                  0                0  
1          0     S****p         a****9                  0                0  
2          0     H****2         H****2                  0                0  
3          0     b****e         b****e                  0                0  
4          0     t****5         t****5                  0                0  
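A sketch of a query that returns everything in the bids table, producing output like the table above:

q = """
select *
from bids
"""
print(auctions.query(q).head())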
SELECT Columns Statement

      itemId                                        description  isBuyNowUsed
0  179353985  <p><strong>Description:</strong></p>\n<p>Women...             0
1  177087535  <p><strong>Details & Condition</strong></p...             0
2  180876361  <p>The North Face Womens Pink Long Sleeve Mock...             0
3  177763109  <p><br></p><ul><li><span class="ql-size-large"...             0
4  179660197  <p><b>Title: </b>The North Face Mens Red Flat ...             0
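A sketch of the corresponding query, selecting only a few columns from items:

q = """
select itemid, description, isbuynowused
from items
"""
print(auctions.query(q).head())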
JOIN Statements

Recall our discussion on joining in pandas – these are SQL-style joins, and SQL has the same types.
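A sketch of a left join between items and bids (qualifying itemid with the table name, for reasons we'll see shortly):

q = """
select items.itemid, description, biddername, bidamount, bidtime
from items
left join bids
on items.itemid = bids.itemid
"""
print(auctions.query(q).head())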
      itemId                                        description bidderName  \
0  179353985  <p><strong>Description:</strong></p>\n<p>Women...       None   
1  177087535  <p><strong>Details & Condition</strong></p...       None   
2  180876361  <p>The North Face Womens Pink Long Sleeve Mock...     B****4   
3  177763109  <p><br></p><ul><li><span class="ql-size-large"...     n****m   
4  177763109  <p><br></p><ul><li><span class="ql-size-large"...     N****0   
   bidAmount                     bidTime  
0        NaN                        None  
1        NaN                        None  
2      19.99  2023-10-18 05:54:55.327000  
3      10.00  2023-09-17 11:52:27.447000  
4      14.00  2023-09-17 17:33:48.517000  
JOIN Aliases

Should this run?
No – itemid exists in both tables, so sqlite doesn't know which one we mean:

q = """
select itemid, description, biddername, bidamount, bidtime
from items
left join bids
on items.itemid = bids.itemid
"""
print(auctions.query(q).head())

OperationalError: (sqlite3.OperationalError) ambiguous column name: itemid
JOIN Renaming Tables

It's often convenient to rename tables in joins to make your query less verbose (potentially at the cost of readability).
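The same join with table aliases (a sketch):

q = """
select i.itemid, description, biddername, bidamount, bidtime
from items as i
left join bids as b
on i.itemid = b.itemid
"""
print(auctions.query(q).head())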
      itemId                                        description bidderName  \
0  179353985  <p><strong>Description:</strong></p>\n<p>Women...       None   
1  177087535  <p><strong>Details & Condition</strong></p...       None   
2  180876361  <p>The North Face Womens Pink Long Sleeve Mock...     B****4   
3  177763109  <p><br></p><ul><li><span class="ql-size-large"...     n****m   
4  177763109  <p><br></p><ul><li><span class="ql-size-large"...     N****0   
   bidAmount                     bidTime  
0        NaN                        None  
1        NaN                        None  
2      19.99  2023-10-18 05:54:55.327000  
3      10.00  2023-09-17 11:52:27.447000  
4      14.00  2023-09-17 17:33:48.517000  
For each of the join types supported in sqlite (left, inner, cross), perform the join on the two tables and report the number of observations in the resulting join.
[551, 879, 551]
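A sketch of the pattern for one join type – the counts above were presumably collected by repeating this for each of left, inner, and cross:

q = """
select count(*) as n
from items
left join bids
on items.itemid = bids.itemid
"""
print(auctions.query(q))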
WHERE

      itemId                                        description bidderName  \
0  178348858  <p><br></p><ul><li><span class="ql-size-large"...     a****9   
1  178348858  <p><br></p><ul><li><span class="ql-size-large"...     S****p   
2  178348858  <p><br></p><ul><li><span class="ql-size-large"...     H****2   
3  178348858  <p><br></p><ul><li><span class="ql-size-large"...     b****e   
4  178348858  <p><br></p><ul><li><span class="ql-size-large"...     t****5   
   bidAmount                     bidTime  
0       20.0  2023-09-18 16:11:04.587000  
1       12.0  2023-09-22 14:22:06.700000  
2       23.0  2023-09-23 12:35:18.157000  
3       35.0  2023-09-23 18:23:27.993000  
4       48.0  2023-09-23 18:37:47.213000  
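A sketch of a query with a WHERE clause that would produce a table like the one above (restricting to a single item):

q = """
select i.itemid, description, biddername, bidamount, bidtime
from items as i
inner join bids as b
on i.itemid = b.itemid
where i.itemid = 178348858
"""
print(auctions.query(q).head())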
WHERE With Multiple Conditions

      itemId                                        description bidderName  \
0  180876361  <p>The North Face Womens Pink Long Sleeve Mock...     B****4   
1  177763109  <p><br></p><ul><li><span class="ql-size-large"...     n****m   
2  177763109  <p><br></p><ul><li><span class="ql-size-large"...     N****0   
3  177763109  <p><br></p><ul><li><span class="ql-size-large"...     M****y   
4  177763109  <p><br></p><ul><li><span class="ql-size-large"...     M****y   
   bidAmount                     bidTime  
0      19.99  2023-10-18 05:54:55.327000  
1      10.00  2023-09-17 11:52:27.447000  
2      14.00  2023-09-17 17:33:48.517000  
3      15.00  2023-09-17 18:27:00.087000  
4      15.00  2023-09-17 18:33:37.233000  
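Conditions can be combined with AND and OR. A sketch consistent with the rows shown above (bids of at least $10 but under $20):

q = """
select i.itemid, description, biddername, bidamount, bidtime
from items as i
inner join bids as b
on i.itemid = b.itemid
where bidamount >= 10 and bidamount < 20
"""
print(auctions.query(q).head())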
GROUP BY

The same as .groupby() in pandas – add aggregating functions to the SELECT clause.
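A sketch counting the number of distinct bidders per item:

q = """
select itemid, count(distinct biddername) as n_bidders
from bids
group by itemid
"""
print(auctions.query(q).head())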
      itemId  n_bidders
0  165561698          1
1  170983900          1
2  172998011          2
3  173907435          1
4  174445924          3
COUNT

We can also just count observations without a grouping:
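For example (a sketch):

q = """
select count(*)
from items
"""
print(auctions.query(q))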
   count(*)
0       500
MIN and MAX

In SQL, MIN and MAX are aggregating functions that work the same way as COUNT. Use them to create a table of the number of bids each bidder submitted for each item, as well as their largest and smallest bid.
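A sketch of one solution:

q = """
select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid,
max(bidamount) as max_bid
from bids
group by itemid, biddername
"""
print(auctions.query(q).head())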
      itemId bidderName  n_bids  min_bid  max_bid
0  165561698     n****4       1     9.91     9.91
1  170983900     c****3       1     9.91     9.91
2  172998011     A****e       1     9.91     9.91
3  172998011     J****m       1     9.91     9.91
4  173907435     M****n       1    14.99    14.99
What if we only care about bid distribution for a bidder when their largest bid is more than $20?
q = """
select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid,
max(bidamount) as max_bid
from bids
group by itemid, biddername
where max_bid > 20
"""
print(auctions.query(q).head())

OperationalError: (sqlite3.OperationalError) near "where": syntax error
HAVING

If we want to filter on the aggregate function value, we need to use HAVING instead of WHERE.
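A sketch adding a HAVING clause to the previous query:

q = """
select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid,
max(bidamount) as max_bid
from bids
group by itemid, biddername
having max_bid > 20
"""
print(auctions.query(q).head())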
      itemId bidderName  n_bids  min_bid  max_bid
0  174767945     C****2       3    24.44    34.00
1  174767945     b****z       4    25.00    33.00
2  174871788     J****3       1    21.00    21.00
3  174871788     v****l       3    15.00    22.00
4  174901466     c****8       1    39.99    39.99
ORDER BY

Sorting works in an intuitive way.
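A sketch sorting the same aggregation by the largest bid:

q = """
select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid,
max(bidamount) as max_bid
from bids
group by itemid, biddername
order by max_bid desc
"""
print(auctions.query(q).head())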
      itemId bidderName  n_bids  min_bid  max_bid
0  180573534     j****a       1    301.0    301.0
1  180573534     A****3       4    140.0    300.0
2  180601736     c****c       4    180.0    201.0
3  180601736     A****8       2    150.0    200.0
4  180601736     B****a       1    160.0    160.0
LIMIT

We've been asking for the head of our DataFrame to limit output – we can do this directly in the query:
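A sketch:

q = """
select itemid, biddername, count(*) as n_bids, min(bidamount) as min_bid,
max(bidamount) as max_bid
from bids
group by itemid, biddername
order by max_bid desc
limit 1
"""
print(auctions.query(q))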
      itemId bidderName  n_bids  min_bid  max_bid
0  180573534     j****a       1    301.0    301.0
In our sample, how many bidders participate in multiple auctions? And how many auctions do they participate in?
60
We’ll see soon that we could also do this with a “subquery”
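A sketch of the subquery approach (count the bidders whose bids span more than one auction):

q = """
select count(*)
from (
    select biddername
    from bids
    group by biddername
    having count(distinct itemid) > 1
)
"""
print(auctions.query(q))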
   count(*)
0        60
OVER

If we want to compute an operation by group and assign it as a new variable, we need to tell SQL how to organize the groups:
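A sketch using a window function to attach each item's minimum bid to every bid row:

q = """
select itemid,
    min(bidamount) over (partition by itemid) as min_bid,
    itemprice
from bids
"""
print(auctions.query(q).head())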
      itemId  min_bid  itemPrice
0  165561698     9.91       9.91
1  170983900     9.91       9.91
2  172998011     9.91       9.91
3  172998011     9.91       9.91
4  173907435    14.99      14.99
LAG

Window functions are particularly useful if we need to lag data in SQL.
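A sketch lagging the item price within each item, ordered by bid time:

q = """
select itemid,
    min(bidamount) over (partition by itemid) as min_bid,
    itemprice,
    lag(itemprice) over (partition by itemid order by bidtime) as lagged_price
from bids
"""
print(auctions.query(q).head())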
      itemId  min_bid  itemPrice  lagged_price
0  165561698     9.91       9.91           NaN
1  170983900     9.91       9.91           NaN
2  172998011     9.91       9.91           NaN
3  172998011     9.91       9.91          9.91
4  173907435    14.99      14.99           NaN
String concatenation in SQL is performed with ||
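A sketch pasting the title and description together:

q = """
select title, itemid, title || ' ' || description as full_description
from items
"""
print(auctions.query(q).head())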
                                               title     itemId  \
0         Womens Size M The North Face Zip Up Jacket  179353985   
1  The North Face Women's Size 4 Tan/Khaki Lightw...  177087535   
2  The North Face Womens Pink Long Sleeve Mock Ne...  180876361   
3  The North Face Women's Medium Sweaters/Shirt L...  177763109   
4  The North Face Mens Red Flat Front Slash Pocke...  179660197   
                                    full_description  
0  Womens Size M The North Face Zip Up Jacket <p>...  
1  The North Face Women's Size 4 Tan/Khaki Lightw...  
2  The North Face Womens Pink Long Sleeve Mock Ne...  
3  The North Face Women's Medium Sweaters/Shirt L...  
4  The North Face Mens Red Flat Front Slash Pocke...  
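Arithmetic on columns also works as you would expect. A sketch consistent with the table below (final price as current price plus shipping):

q = """
select itemid, currentprice, shipping, currentprice + shipping as final_price
from items
"""
print(auctions.query(q).head())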
      itemId  currentPrice  shipping  final_price
0  179353985         10.99         0        10.99
1  177087535         24.98         0        24.98
2  180876361         19.99         0        19.99
3  177763109         15.00         0        15.00
4  179660197         12.99         0        12.99
CASE WHEN

SQL's if-else statement (similar to R's ifelse or case_when verbs).
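A sketch (the exact rule used isn't shown; this one matches the rows displayed below):

q = """
select itemid, currentprice, shipping,
    case when shipping = 0 then currentprice + 5
        else currentprice + shipping
    end as final_price
from items
"""
print(auctions.query(q).head())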
      itemId  currentPrice  shipping  final_price
0  176705357         19.99         2        21.99
1  179025543         14.99         2        16.99
2  179353985         10.99         0        15.99
3  177087535         24.98         0        29.98
4  180876361         19.99         0        24.99
We can use LIKE to pattern match – % means zero, one, or multiple characters (this is a bad application – why?)
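A sketch of one way such a size column could be built with LIKE (purely illustrative – matching free text like this is exactly why it's a bad application):

q = """
select itemid, currentprice,
    case when description like '%small%' then 'small'
        else 'large'
    end as size
from items
"""
print(auctions.query(q).head())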
      itemId  currentPrice   size
0  177087535         24.98  small
1  180876361         19.99  small
2  177763109         15.00  large
3  179660197         12.99  small
4  176601978          9.99  large
To run statements that modify the database (rather than just read from it), we add an execute method that facilitates this for our engine:

from sqlalchemy import text
class DataBase:
    def __init__(self, loc: str, db_type: str = "sqlite") -> None:
        """Initialize the class and connect to the database"""
        self.loc = loc
        self.db_type = db_type
        self.engine = create_engine(f'{self.db_type}:///{self.loc}')

    def query(self, q: str) -> pd.DataFrame:
        """Run a query against the database and return a DataFrame"""
        with Session(self.engine) as session:
            df = pd.read_sql(q, session.bind)
        return df

    def execute(self, q: str) -> None:
        """Execute statement on the database"""
        with self.engine.connect() as conn:
            conn.execute(text(q))
auctions = DataBase(path)

If we want to create a new table that contains only observations with bids where the buy now option wasn't used, we can execute a statement to do so.
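A sketch of what that pair of statements might look like (dropping any existing table first, then creating the joined table):

auctions.execute("drop table if exists full_data")

q = """
create table full_data as
select *
from items
inner join bids
on items.itemid = bids.itemid
where isbuynowused = 0
"""
auctions.execute(q)
print(auctions.query("select * from full_data limit 1"))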
   index buyerCountry buyerCountryCode buyerState buyerStreet buyerZip  \
0  12100         None               US       None        None     None   
                              categoryParentList defaultShippingResults  \
0  10|Clothing|27|Women's Clothing|154|Outerwear                   None   
                                         description  \
0  <p>The North Face Womens Pink Long Sleeve Mock...   
                                         imageServer  ...  \
0  https://shopgoodwillimages.azureedge.net/produ...  ...   
                      bidTime quantity:1 bidIPAddress adCode serverIP  \
0  2023-10-18 05:54:55.327000          1         None   None     None   
  retracted bidderName highBidderName isBuyerHighBidder isLoggedInBuyer  
0         0     B****4         B****4                 0               0  
[1 rows x 100 columns]
Why do we need the first statement? Because SQL won’t let us create a table that already has a given name
q = """
create table full_data as
select * from items
"""
auctions.execute(q)

OperationalError: (sqlite3.OperationalError) table full_data already exists
   index buyerCountry buyerCountryCode buyerState buyerStreet buyerZip  \
0  12100         None               US       None        None     None   
                              categoryParentList defaultShippingResults  \
0  10|Clothing|27|Women's Clothing|154|Outerwear                   None   
                                         description  \
0  <p>The North Face Womens Pink Long Sleeve Mock...   
                                         imageServer  ...  \
0  https://shopgoodwillimages.azureedge.net/produ...  ...   
                      bidTime quantity:1 bidIPAddress adCode serverIP  \
0  2023-10-18 05:54:55.327000          1         None   None     None   
  retracted bidderName highBidderName isBuyerHighBidder isLoggedInBuyer  
0         0     B****4         B****4                 0               0  
[1 rows x 100 columns]
auctions = DataBase(path)
print(auctions.query("select * from full_data limit 1"))

OperationalError: (sqlite3.OperationalError) no such table: full_data
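A likely explanation (worth verifying) is that our execute method never commits: with SQLAlchemy 2.0, work done on a connection from engine.connect() is rolled back when the connection closes unless we commit. A sketch of a safer execute:

    def execute(self, q: str) -> None:
        """Execute a statement on the database and commit it"""
        with self.engine.connect() as conn:
            conn.execute(text(q))
            conn.commit()  # without this, the change may be rolled back when conn closes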
For each bid, express its time as relative to when the auction ended (endtime). That means that if an auction was 10 hours long (as measured by endtime - starttime) and a bid was placed an hour before the auction ended, it would have a normalized timestamp of .1. Plot this distribution as a histogram.
Hint: to compute the difference in time between two dates, use julianday(time1)-julianday(time2).
      itemId                   startTime                     endTime    length
0  179353985  2023-09-28 17:00:54.000000  2023-10-02 18:14:00.000000  4.050764
1  177087535  2023-09-04 22:54:00.000000  2023-09-12 19:46:00.000000  7.869444
2  180876361  2023-10-14 03:18:40.000000  2023-10-19 04:04:40.000000  5.031944
3  177763109  2023-09-12 08:22:45.000000  2023-09-17 18:34:00.000000  5.424479
      itemId                     bidTime                   startTime  \
0  178348858  2023-09-18 16:11:04.587000  2023-09-18 14:29:56.000000   
1  178348858  2023-09-22 14:22:06.700000  2023-09-18 14:29:56.000000   
2  178348858  2023-09-23 12:35:18.157000  2023-09-18 14:29:56.000000   
3  178348858  2023-09-23 18:23:27.993000  2023-09-18 14:29:56.000000   
4  178348858  2023-09-23 18:37:47.213000  2023-09-18 14:29:56.000000   
                      endTime  time_norm  
0  2023-09-23 18:39:00.000000   0.986422  
1  2023-09-23 18:39:00.000000   0.227799  
2  2023-09-23 18:39:00.000000   0.048825  
3  2023-09-23 18:39:00.000000   0.002085  
4  2023-09-23 18:39:00.000000   0.000163  
What does this tell us? Are these results surprising?
q = """
select b.itemid, b.bidtime, a.starttime, a.endtime,
(julianday(endtime)-julianday(bidtime)) / a.length as time_norm
from bids as b
inner join (
    select itemid, starttime, endtime, 
    julianday(endtime) - julianday(starttime) as length
    from items
) as a
on b.itemid=a.itemid
"""
df = auctions.query(q)
print(df.head(2))

      itemId                     bidTime                   starttime  \
0  178348858  2023-09-18 16:11:04.587000  2023-09-18 14:29:56.000000   
1  178348858  2023-09-22 14:22:06.700000  2023-09-18 14:29:56.000000   
                      endtime  time_norm  
0  2023-09-23 18:39:00.000000   0.986422  
1  2023-09-23 18:39:00.000000   0.227799  
Using WITH improves readability
q = """
with a as (
    select itemid, starttime, endtime, 
    julianday(endtime) - julianday(starttime) as length
    from items
)
select b.itemid, b.bidtime, a.starttime, a.endtime,
(julianday(endtime)-julianday(bidtime)) / a.length as time_norm
from bids as b
inner join a
on b.itemid=a.itemid
"""
df = auctions.query(q)
print(df.head(2))

      itemId                     bidTime                   starttime  \
0  178348858  2023-09-18 16:11:04.587000  2023-09-18 14:29:56.000000   
1  178348858  2023-09-22 14:22:06.700000  2023-09-18 14:29:56.000000   
                      endtime  time_norm  
0  2023-09-23 18:39:00.000000   0.986422  
1  2023-09-23 18:39:00.000000   0.227799  
There are no unified linting tools for SQL like pylint for Python, so style is largely up to you. Always assume that the code you write today will be inherited by a murderous psychopath who knows where you live! Some suggestions:

- Put the comma first when listing columns across multiple lines (why?)
- Align column names with manual spaces
- If nesting subqueries, use consistent indentation
sqlite Database

If you have CSV files, you can create a database like this:
create_db.py
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:////Users/hlukas/git/personal_website/static/econ-481/data/auctions.db")
bids = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/bidding_data.csv')
items = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/item_data.csv')
items_small = items.sample(500)
bids_small = bids.loc[bids['itemId'].isin(items_small['itemId'])]
bids_small.to_sql(con=engine, name='bids', if_exists='replace')
items_small.to_sql(con=engine, name='items', if_exists='replace')

update_db.py
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine(f'sqlite:///{path}')
Base = declarative_base()
Base.metadata.create_all(engine)
items = pd.read_csv('/Users/hlukas/Google Drive/Raw Data/goodwill/final_data/the north face/item_data.csv')
items_small = items.sample(500)
items_small.to_sql(con=engine, name='items', if_exists='append')

sqlalchemy

We don't really need to write SQL if we don't want to – we can use the package:
from sqlalchemy import MetaData, Table, select
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine(f'sqlite:///{path}')
Base = declarative_base()
Base.metadata.reflect(engine)
bids = Base.metadata.tables['bids']
query = select(bids.c.itemId, bids.c.bidAmount)\
    .where(bids.c.bidAmount==10)\
    .limit(5)
with Session(engine) as s:
    print(pd.DataFrame(s.execute(query)))

      itemId  bidAmount
0  177106026       10.0
1  177963226       10.0
2  178438915       10.0
3  181622497       10.0
4  179719241       10.0
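The GROUP BY query from earlier can be expressed the same way. A sketch (assuming the reflected bids table above) that would produce a table like the one below:

from sqlalchemy import func

query = (
    select(bids.c.itemId,
           func.count(func.distinct(bids.c.bidderName)).label("n_bidders"))
    .group_by(bids.c.itemId)
)

with Session(engine) as s:
    print(pd.DataFrame(s.execute(query)))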
        itemId  n_bidders
0    165561698          1
1    170983900          1
2    172998011          2
3    173907435          1
4    174445924          3
..         ...        ...
167  182760698          1
168  182777527          1
169  182883873          2
170  182894197          1
171  182925212          2
[172 rows x 2 columns]