1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
|
# coding: utf-8
# # Example 02: SoSQL Queries
#
# Constructing custom queries to conserve bandwith and computational resources
# ## Setup
# In[1]:
import os
# Note: we don't need Pandas
# Filters allow you to accomplish many basic operations automatically
from sodapy import Socrata
# ## Find Some Data
#
# As in the first example, I'm using the Santa Fe political contribution dataset.
#
# `https://opendata.socrata.com/dataset/Santa-Fe-Contributors/f92i-ik66.json`
# In[2]:
socrata_domain = "opendata.socrata.com"
socrata_dataset_identifier = "f92i-ik66"
# If you choose to use a token, run the following command on the terminal (or add it to your .bashrc)
# $ export SODAPY_APPTOKEN=<token>
socrata_token = os.environ.get("SODAPY_APPTOKEN")
# In[3]:
client = Socrata(socrata_domain, socrata_token)
# ## Use Metadata to Plan Your Query
# You've probably looked through the column names and descriptions in the web UI,
# but it can be nice to have them right in your workspace as well.
# In[4]:
metadata = client.get_metadata(socrata_dataset_identifier)
[x["name"] for x in metadata["columns"]]
# In[5]:
meta_amount = [x for x in metadata["columns"] if x["name"] == "AMOUNT"][0]
meta_amount
# ## Efficiently Query for Data
# ### Restrict rows to above-average donations
# In[6]:
# Get the average from the metadata. Note that it's a string by default
meta_amount["cachedContents"]["average"]
# In[7]:
# Use the 'where' argument to filter the data before downloading it
results = client.get(socrata_dataset_identifier, where="amount >= 2433")
print(
"Total number of non-null results: {}".format(
meta_amount["cachedContents"]["non_null"]
)
)
print("Number of results downloaded: {}".format(len(results)))
results[:3]
# ### Restrict columns and order rows
# Often, you know which columns you want, so you can further simplify the download.
#
# It can also be valuable to have results in order, so that you can quickly grab the
# largest or smallest.
# In[8]:
results = client.get(
socrata_dataset_identifier,
where="amount < 2433",
select="amount, job",
order="amount ASC",
)
results[:3]
# ### Perform basic operations
# You can even accomplish some basic analytics operations like finding sums.
#
# If you're planning on doing further processing, note that the numeric outputs
# are strings by default.
# In[10]:
results = client.get(
socrata_dataset_identifier,
group="recipient",
select="sum(amount), recipient",
order="sum(amount) DESC",
)
results
# ### Break download into managable chunks
# Sometimes you do want all the data, but it would be too big for one download.
#
# By default, all queries have a limit of 1000 rows, but you can manually set it
# higher or lower. If you want to loop through results, just use `offset`
# In[11]:
results = client.get(socrata_dataset_identifier, limit=6, select="name, amount")
results
# In[11]:
loop_size = 3
num_loops = 2
for i in range(num_loops):
results = client.get(
socrata_dataset_identifier,
select="name, amount",
limit=loop_size,
offset=loop_size * i,
)
print("\n> Loop number: {}".format(i))
# This simply formats the output nicely
for result in results:
print(result)
# ### Query strings
# All of the queries above were made with method parameters,
# but you could also pass all the parameters at once in a
# SQL-like format
# In[13]:
query = """
select
name,
amount
where
amount > 1000
and amount < 2000
limit
5
"""
results = client.get(socrata_dataset_identifier, query=query)
results
# ### Free text search
# My brother just got a dog named Slider, so we were curious about how many other New York City dogs had that name.
#
# Searches with `q` match anywhere in the row, which allows you to quickly search through data with several free text columns of interest.
# In[20]:
nyc_dogs_domain = "data.cityofnewyork.us"
nyc_dogs_dataset_identifier = "nu7n-tubp"
nyc_dogs_client = Socrata(nyc_dogs_domain, socrata_token)
results = nyc_dogs_client.get(
nyc_dogs_dataset_identifier, q="Slider", select="animalname, breedname"
)
results
# # Going Further
#
# There's plenty more to do! Check out [Queries using SODA](https://dev.socrata.com/docs/queries/) for additional functionality
|