DECLARE start_date STRING DEFAULT '20170201';
DECLARE end_date STRING DEFAULT '20170228';
WITH visit AS (
SELECT
fullvisitorid,
MIN(date) AS date_first_visit
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN start_date AND end_date
GROUP BY fullvisitorid
),
transactions AS (
SELECT
fullvisitorid,
MIN(date) AS date_transactions
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga,
UNNEST(ga.hits) AS hits
WHERE
hits.transaction.transactionId IS NOT NULL
AND
_TABLE_SUFFIX BETWEEN start_date AND end_date
GROUP BY fullvisitorid
),
device_transactions AS (
SELECT DISTINCT
fullvisitorid,
date,
device.deviceCategory
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga,
UNNEST(ga.hits) AS hits
WHERE
hits.transaction.transactionId IS NOT NULL
AND
_TABLE_SUFFIX BETWEEN start_date AND end_date
),
visits_transactions AS (
SELECT
visit.fullvisitorid,
date_first_visit,
date_transactions,
device_transactions.deviceCategory AS device_transaction
FROM
visit
JOIN transactions
ON visit.fullvisitorid = transactions.fullvisitorid
JOIN device_transactions
ON visit.fullvisitorid = device_transactions.fullvisitorid
AND transactions.date_transactions = device_transactions.date
)
SELECT
fullvisitorid,
DATE_DIFF(PARSE_DATE('%Y%m%d', date_transactions), PARSE_DATE('%Y%m%d', date_first_visit), DAY) AS time,
device_transaction
FROM visits_transactions
ORDER BY fullvisitorid;
DECLARE start_date STRING DEFAULT '20170101';
DECLARE end_date STRING DEFAULT '20170630';
WITH daily_revenue AS (
SELECT
trafficSource.source AS source,
date,
SUM(productRevenue) / 1000000 AS revenue
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST (hits) AS hits,
UNNEST (hits.product) AS product
WHERE
_table_suffix BETWEEN start_date AND end_date
GROUP BY
source, date
),
weekly_revenue AS (
SELECT
source,
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))), 'W', EXTRACT(WEEK FROM (PARSE_DATE('%Y%m%d', date)))) AS week,
SUM(revenue) AS revenue
FROM daily_revenue
GROUP BY source, week
),
monthly_revenue AS (
SELECT
source,
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))),'0', EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
SUM(revenue) AS revenue
FROM daily_revenue
GROUP BY source, month
),
top_source AS (
SELECT source, SUM(revenue) AS total_revenue
FROM daily_revenue
GROUP BY source
ORDER BY total_revenue DESC
LIMIT 1
),
max_revenues AS (
(
SELECT
'Daily' AS time_type,
date AS time,
source,
MAX(revenue) AS max_revenue
FROM daily_revenue
WHERE source = (SELECT source FROM top_source)
GROUP BY source, date
ORDER BY max_revenue DESC
LIMIT 1
)
UNION ALL
(
SELECT
'Weekly' AS time_type,
week AS time,
source,
MAX(revenue) AS max_revenue
FROM weekly_revenue
WHERE source = (SELECT source FROM top_source)
GROUP BY source, week
ORDER BY max_revenue DESC
LIMIT 1
)
UNION ALL
(
SELECT
'Monthly' AS time_type,
month AS time,
source,
MAX(revenue) AS max_revenue
FROM monthly_revenue
WHERE source = (SELECT source FROM top_source)
GROUP BY source, month
ORDER BY max_revenue DESC
LIMIT 1
)
)
SELECT
max_revenue
FROM max_revenues
ORDER BY max_revenue DESC;
WITH cte1 AS (
SELECT
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))), '0',
EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
SUM(totals.pageviews) / COUNT(DISTINCT fullVisitorId) AS avg_pageviews_non_purchase
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST (hits) AS hits,
UNNEST (hits.product) AS product
WHERE
_table_suffix BETWEEN '0401' AND '0731'
AND totals.transactions IS NULL
AND product.productRevenue IS NULL
GROUP BY month
),
cte2 AS (
SELECT
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))), '0',
EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
SUM(totals.pageviews) / COUNT(DISTINCT fullVisitorId) AS avg_pageviews_purchase
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST (hits) AS hits,
UNNEST (hits.product) AS product
WHERE
_table_suffix BETWEEN '0401' AND '0731'
AND totals.transactions >= 1
AND product.productRevenue IS NOT NULL
GROUP BY month
)
SELECT
month, avg_pageviews_purchase, avg_pageviews_non_purchase
FROM cte1 INNER JOIN cte2
USING(month)
ORDER BY month;
with product_and_quatity AS (
SELECT
DISTINCT v2ProductName AS other_purchased_products,
SUM(productQuantity) AS quatity
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST(hits) AS hits,
UNNEST(hits.product) AS product
WHERE
_table_suffix BETWEEN '0701' AND '0731'
AND NOT REGEXP_CONTAINS(LOWER(v2ProductName), 'youtube')
AND fullVisitorID IN (
SELECT
DISTINCT fullVisitorId
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST(hits) AS hits,
UNNEST(hits.product) AS product
WHERE
_table_suffix BETWEEN '0701' AND '0731'
AND REGEXP_CONTAINS(LOWER(v2ProductName), 'youtube')
)
GROUP BY v2ProductName
)
SELECT other_purchased_products
FROM product_and_quatity
ORDER BY quatity DESC
LIMIT 1;
WITH incident_stats AS (
SELECT
COUNT(descript) AS total_pub_intox
FROM
`bigquery-public-data.austin_incidents.incidents_2016`
WHERE
descript = 'PUBLIC INTOXICATION'
GROUP BY
date
),
average_and_stddev AS (
SELECT
AVG(total_pub_intox) AS avg,
STDDEV(total_pub_intox) AS stddev
FROM
incident_stats
),
daily_z_scores AS (
SELECT
date,
COUNT(descript) AS total_pub_intox,
ROUND((COUNT(descript) - a.avg) / a.stddev, 2) AS z_score
FROM
`bigquery-public-data.austin_incidents.incidents_2016`,
(SELECT avg, stddev FROM average_and_stddev) AS a
WHERE
descript = 'PUBLIC INTOXICATION'
GROUP BY
date, avg, stddev
)
SELECT
date
FROM
daily_z_scores
ORDER BY
z_score DESC
LIMIT 1
OFFSET 1
with page_visit_sequence AS (
SELECT
fullVisitorID,
visitID,
pagePath,
LEAD(timestamp, 1) OVER (PARTITION BY fullVisitorId, visitID order by timestamp) - timestamp AS page_duration,
LEAD(pagePath, 1) OVER (PARTITION BY fullVisitorId, visitID order by timestamp) AS next_page,
RANK() OVER (PARTITION BY fullVisitorId, visitID order by timestamp) AS step_number
FROM (
SELECT
pages.fullVisitorID,
pages.visitID,
pages.pagePath,
visitors.campaign,
MIN(pages.timestamp) timestamp
FROM (
SELECT
fullVisitorId,
visitId,
trafficSource.campaign campaign
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST(hits) as hits
WHERE
_TABLE_SUFFIX BETWEEN '20170101' AND '20170131'
AND hits.type='PAGE'
AND REGEXP_CONTAINS(hits.page.pagePath, r'^/home')
AND REGEXP_CONTAINS(trafficSource.campaign, r'Data Share')
) AS visitors
JOIN(
SELECT
fullVisitorId,
visitId,
visitStartTime + hits.time / 1000 AS timestamp,
hits.page.pagePath AS pagePath
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`,
UNNEST(hits) as hits
WHERE
_TABLE_SUFFIX BETWEEN '20170101' AND '20170131'
) as pages
ON
visitors.fullVisitorID = pages.fullVisitorID
AND visitors.visitID = pages.visitID
GROUP BY
pages.fullVisitorID, visitors.campaign, pages.visitID, pages.pagePath
ORDER BY
pages.fullVisitorID, pages.visitID, timestamp
)
ORDER BY fullVisitorId, visitID, step_number
),
most_common_next_page AS (
SELECT
next_page,
COUNT(next_page) as page_count
FROM page_visit_sequence
WHERE
next_page IS NOT NULL
AND REGEXP_CONTAINS(pagePath, r'^/home')
GROUP BY next_page
ORDER BY page_count DESC
LIMIT 1
),
max_page_duration AS (
SELECT MAX(page_duration) as max_duration
FROM page_visit_sequence
WHERE
page_duration IS NOT NULL
AND REGEXP_CONTAINS(pagePath, r'^/home')
)
SELECT
next_page,
max_duration
FROM
most_common_next_page,
max_page_duration;
WITH MONTHLY_REVENUE AS (
SELECT
FORMAT_DATE("%Y%m", PARSE_DATE("%Y%m%d", date)) AS month,
trafficSource.source AS source,
ROUND(SUM(totals.totalTransactionRevenue) / 1000000, 2) AS revenue
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_2017*`
GROUP BY 1, 2
),
YEARLY_REVENUE AS (
SELECT
source,
SUM(revenue) AS total_revenue
FROM MONTHLY_REVENUE
GROUP BY source
),
TOP_SOURCE AS (
SELECT
source
FROM YEARLY_REVENUE
ORDER BY total_revenue DESC
LIMIT 1
),
SOURCE_MONTHLY_REVENUE AS (
SELECT
month,
source,
revenue
FROM MONTHLY_REVENUE
WHERE source IN (SELECT source FROM TOP_SOURCE)
),
REVENUE_DIFF AS (
SELECT
source,
ROUND(MAX(revenue), 2) AS max_revenue,
ROUND(MIN(revenue), 2) AS min_revenue,
ROUND(MAX(revenue) - MIN(revenue), 2) AS diff_revenue
FROM SOURCE_MONTHLY_REVENUE
GROUP BY source
)
SELECT
source,
diff_revenue
FROM REVENUE_DIFF;
WITH GET_CUS_ID AS (
SELECT
DISTINCT fullVisitorId as Henley_CUSTOMER_ID
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_201707*`,
UNNEST(hits) AS hits,
UNNEST(hits.product) as product
WHERE
product.v2ProductName = "YouTube Men's Vintage Henley"
AND product.productRevenue IS NOT NULL
)
SELECT
product.v2ProductName AS other_purchased_products
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_201707*` TAB_A
RIGHT JOIN GET_CUS_ID
ON GET_CUS_ID.Henley_CUSTOMER_ID=TAB_A.fullVisitorId,
UNNEST(hits) AS hits,
UNNEST(hits.product) as product
WHERE
TAB_A.fullVisitorId IN (
SELECT * FROM GET_CUS_ID
)
AND product.v2ProductName <> "YouTube Men's Vintage Henley"
AND product.productRevenue IS NOT NULL
GROUP BY
product.v2ProductName
ORDER BY
SUM(product.productQuantity) DESC
LIMIT 1;
SELECT
COUNT(DISTINCT MDaysUsers.user_pseudo_id) AS n_day_inactive_users_count
FROM
(
SELECT
user_pseudo_id
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
UNNEST(T.event_params) AS event_params
WHERE
event_params.key = 'engagement_time_msec' AND event_params.value.int_value > 0
AND event_timestamp > UNIX_MICROS(TIMESTAMP_SUB(TIMESTAMP('2021-01-07 23:59:59'), INTERVAL 7 DAY))
AND _TABLE_SUFFIX BETWEEN '20210101' AND '20210107'
) AS MDaysUsers
LEFT JOIN
(
SELECT
user_pseudo_id
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` AS T
CROSS JOIN
UNNEST(T.event_params) AS event_params
WHERE
event_params.key = 'engagement_time_msec' AND event_params.value.int_value > 0
AND event_timestamp > UNIX_MICROS(TIMESTAMP_SUB(TIMESTAMP('2021-01-07 23:59:59'), INTERVAL 2 DAY))
AND _TABLE_SUFFIX BETWEEN '20210105' AND '20210107'
) AS NDaysUsers
ON MDaysUsers.user_pseudo_id = NDaysUsers.user_pseudo_id
WHERE
NDaysUsers.user_pseudo_id IS NULL;
WITH us_cases_by_date AS (
SELECT
date,
SUM( cumulative_confirmed ) AS cases
FROM
`bigquery-public-data.covid19_open_data.covid19_open_data`
WHERE
country_name="United States of America"
AND date between '2020-03-01' and '2020-04-30'
GROUP BY
date
ORDER BY
date ASC
)
, us_previous_day_comparison AS
(SELECT
date,
cases,
LAG(cases) OVER(ORDER BY date) AS previous_day,
cases - LAG(cases) OVER(ORDER BY date) AS net_new_cases,
(cases - LAG(cases) OVER(ORDER BY date))*100/LAG(cases) OVER(ORDER BY date) AS percentage_increase
FROM us_cases_by_date
)
SELECT
FORMAT_DATE('%m-%d', Date)
FROM
us_previous_day_comparison
ORDER BY
percentage_increase
DESC
LIMIT 1
WITH top20route AS (
SELECT
start_station_name, end_station_name, avg_bike_duration, avg_taxi_duration
FROM (
SELECT
start_station_name,
end_station_name,
ROUND(start_station_latitude, 3) AS ss_lat,
ROUND(start_station_longitude, 3) AS ss_long,
ROUND(end_station_latitude, 3) AS es_lat,
ROUND(end_station_longitude, 3) AS es_long,
AVG(tripduration) AS avg_bike_duration,
COUNT(*) AS bike_trips
FROM
`bigquery-public-data.new_york.citibike_trips`
WHERE
EXTRACT(YEAR from starttime) = 2016 AND
start_station_name != end_station_name
GROUP BY
start_station_name, end_station_name, ss_lat, ss_long, es_lat, es_long
ORDER BY
bike_trips DESC
LIMIT
20
) a
JOIN (
SELECT
ROUND(pickup_latitude, 3) AS pu_lat,
ROUND(pickup_longitude, 3) AS pu_long,
ROUND(dropoff_latitude, 3) AS do_lat,
ROUND(dropoff_longitude, 3) AS do_long,
AVG(UNIX_SECONDS(dropoff_datetime)-UNIX_SECONDS(pickup_datetime)) AS avg_taxi_duration,
COUNT(*) AS taxi_trips
FROM
`bigquery-public-data.new_york.tlc_yellow_trips_2016`
GROUP BY
pu_lat, pu_long, do_lat, do_long
) b
ON
a.ss_lat = b.pu_lat AND
a.es_lat = b.do_lat AND
a.ss_long = b.pu_long AND
a.es_long = b.do_long
)
SELECT start_station_name
FROM top20route
WHERE avg_bike_duration < avg_taxi_duration
ORDER BY
avg_bike_duration
DESC
LIMIT 1
SELECT
ROUND(MIN(trip_seconds) / 60, 0) AS min_minutes,
ROUND(MAX(trip_seconds) / 60, 0) AS max_minutes,
COUNT(*) AS total_trips,
AVG(fare) AS average_fare
FROM (
SELECT
trip_seconds,
NTILE(6) OVER (ORDER BY trip_seconds) AS quantile,
fare
FROM
`bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
trip_seconds BETWEEN 0 AND 3600
)
GROUP BY
quantile
ORDER BY
min_minutes, max_minutes;
SELECT
age.country_name,
SUM(age.population) AS under_25,
pop.midyear_population AS total,
ROUND((SUM(age.population) / pop.midyear_population) * 100,2) AS pct_under_25
FROM (
SELECT
country_name,
population,
country_code
FROM
`bigquery-public-data.census_bureau_international.midyear_population_agespecific`
WHERE
year =2020
AND age < 20) age
INNER JOIN (
SELECT
midyear_population,
country_code
FROM
`bigquery-public-data.census_bureau_international.midyear_population`
WHERE
year = 2020) pop
ON
age.country_code = pop.country_code
GROUP BY
1,
3
ORDER BY
4 DESC
/* Remove limit for visualization */
LIMIT
10
WITH transrate AS (
SELECT
DATE(CAST(year AS INT64), CAST(mo AS INT64), CAST(da AS INT64)) AS observation_date
, ROUND((temp - 32.0) / 1.8, 1) AS temp_mean_c -- using Celsius instead of Fahrenheit
, ROUND(prcp * 2.54, 1) AS prcp_cm -- from inches to centimeters
, ROUND(CAST(wdsp AS FLOAT64) * 1.852 / 3.6, 1) AS wdsp_ms -- from knots to meters per second
FROM `bigquery-public-data.noaa_gsod.gsod*`
WHERE _TABLE_SUFFIX = "2019"
AND CAST(mo AS INT64) <= 3
AND stn in (SELECT usaf FROM `bigquery-public-data.noaa_gsod.stations` WHERE name = "ROCHESTER")
),
moving_avg AS (
SELECT
observation_date
, temp_mean_c
, prcp_cm
, wdsp_ms
, AVG(temp_mean_c) OVER (ORDER BY observation_date ROWS 7 PRECEDING) AS temp_moving_avg
, AVG(prcp_cm) OVER (ORDER BY observation_date ROWS 7 PRECEDING) AS prcp_moving_avg
, AVG(wdsp_ms) OVER (ORDER BY observation_date ROWS 7 PRECEDING) AS wdsp_moving_avg
FROM transrate
),
lag_moving_avg AS (
SELECT
observation_date
, temp_mean_c
, prcp_cm
, wdsp_ms
, LAG(temp_moving_avg, 1) OVER (ORDER BY observation_date) AS lag1_temp_moving_avg
, LAG(prcp_moving_avg, 1) OVER (ORDER BY observation_date) AS lag1_prcp_moving_avg
, LAG(wdsp_moving_avg, 1) OVER (ORDER BY observation_date) AS lag1_wdsp_moving_avg
, LAG(temp_moving_avg, 2) OVER (ORDER BY observation_date) AS lag2_temp_moving_avg
, LAG(prcp_moving_avg, 2) OVER (ORDER BY observation_date) AS lag2_prcp_moving_avg
, LAG(wdsp_moving_avg, 2) OVER (ORDER BY observation_date) AS lag2_wdsp_moving_avg
, LAG(temp_moving_avg, 3) OVER (ORDER BY observation_date) AS lag3_temp_moving_avg
, LAG(prcp_moving_avg, 3) OVER (ORDER BY observation_date) AS lag3_prcp_moving_avg
, LAG(wdsp_moving_avg, 3) OVER (ORDER BY observation_date) AS lag3_wdsp_moving_avg
, LAG(temp_moving_avg, 4) OVER (ORDER BY observation_date) AS lag4_temp_moving_avg
, LAG(prcp_moving_avg, 4) OVER (ORDER BY observation_date) AS lag4_prcp_moving_avg
, LAG(wdsp_moving_avg, 4) OVER (ORDER BY observation_date) AS lag4_wdsp_moving_avg
, LAG(temp_moving_avg, 5) OVER (ORDER BY observation_date) AS lag5_temp_moving_avg
, LAG(prcp_moving_avg, 5) OVER (ORDER BY observation_date) AS lag5_prcp_moving_avg
, LAG(wdsp_moving_avg, 5) OVER (ORDER BY observation_date) AS lag5_wdsp_moving_avg
, LAG(temp_moving_avg, 6) OVER (ORDER BY observation_date) AS lag6_temp_moving_avg
, LAG(prcp_moving_avg, 6) OVER (ORDER BY observation_date) AS lag6_prcp_moving_avg
, LAG(wdsp_moving_avg, 6) OVER (ORDER BY observation_date) AS lag6_wdsp_moving_avg
, LAG(temp_moving_avg, 7) OVER (ORDER BY observation_date) AS lag7_temp_moving_avg
, LAG(prcp_moving_avg, 7) OVER (ORDER BY observation_date) AS lag7_prcp_moving_avg
, LAG(wdsp_moving_avg, 7) OVER (ORDER BY observation_date) AS lag7_wdsp_moving_avg
, LAG(temp_moving_avg, 8) OVER (ORDER BY observation_date) AS lag8_temp_moving_avg
, LAG(prcp_moving_avg, 8) OVER (ORDER BY observation_date) AS lag8_prcp_moving_avg
, LAG(wdsp_moving_avg, 8) OVER (ORDER BY observation_date) AS lag8_wdsp_moving_avg
FROM moving_avg
)
SELECT
observation_date
, temp_mean_c
, prcp_cm
, wdsp_ms
, ROUND(lag1_temp_moving_avg, 1) AS lag1_temp_moving_avg
, ROUND(lag1_prcp_moving_avg, 1) AS lag1_prcp_moving_avg
, ROUND(lag1_wdsp_moving_avg, 1) AS lag1_wdsp_moving_avg
, ROUND(lag1_temp_moving_avg - lag2_temp_moving_avg, 1) AS diff2_temp_moving_avg
, ROUND(lag1_prcp_moving_avg - lag2_prcp_moving_avg, 1) AS diff2_prcp_moving_avg
, ROUND(lag1_wdsp_moving_avg - lag2_wdsp_moving_avg, 1) AS diff2_wdsp_moving_avg
, ROUND(lag2_temp_moving_avg, 1) AS lag2_temp_moving_avg
, ROUND(lag2_prcp_moving_avg, 1) AS lag2_prcp_moving_avg
, ROUND(lag2_wdsp_moving_avg, 1) AS lag2_wdsp_moving_avg
, ROUND(lag2_temp_moving_avg - lag3_temp_moving_avg, 1) AS diff3_temp_moving_avg
, ROUND(lag2_prcp_moving_avg - lag3_prcp_moving_avg, 1) AS diff3_prcp_moving_avg
, ROUND(lag2_wdsp_moving_avg - lag3_wdsp_moving_avg, 1) AS diff3_wdsp_moving_avg
, ROUND(lag3_temp_moving_avg, 1) AS lag3_temp_moving_avg
, ROUND(lag3_prcp_moving_avg, 1) AS lag3_prcp_moving_avg
, ROUND(lag3_wdsp_moving_avg, 1) AS lag3_wdsp_moving_avg
, ROUND(lag3_temp_moving_avg - lag4_temp_moving_avg, 1) AS diff4_temp_moving_avg
, ROUND(lag3_prcp_moving_avg - lag4_prcp_moving_avg, 1) AS diff4_prcp_moving_avg
, ROUND(lag3_wdsp_moving_avg - lag4_wdsp_moving_avg, 1) AS diff4_wdsp_moving_avg
, ROUND(lag4_temp_moving_avg, 1) AS lag4_temp_moving_avg
, ROUND(lag4_prcp_moving_avg, 1) AS lag4_prcp_moving_avg
, ROUND(lag4_wdsp_moving_avg, 1) AS lag4_wdsp_moving_avg
, ROUND(lag4_temp_moving_avg - lag5_temp_moving_avg, 1) AS diff5_temp_moving_avg
, ROUND(lag4_prcp_moving_avg - lag5_prcp_moving_avg, 1) AS diff5_prcp_moving_avg
, ROUND(lag4_wdsp_moving_avg - lag5_wdsp_moving_avg, 1) AS diff5_wdsp_moving_avg
, ROUND(lag5_temp_moving_avg, 1) AS lag5_temp_moving_avg
, ROUND(lag5_prcp_moving_avg, 1) AS lag5_prcp_moving_avg
, ROUND(lag5_wdsp_moving_avg, 1) AS lag5_wdsp_moving_avg
, ROUND(lag5_temp_moving_avg - lag6_temp_moving_avg, 1) AS diff6_temp_moving_avg
, ROUND(lag5_prcp_moving_avg - lag6_prcp_moving_avg, 1) AS diff6_prcp_moving_avg
, ROUND(lag5_wdsp_moving_avg - lag6_wdsp_moving_avg, 1) AS diff6_wdsp_moving_avg
, ROUND(lag6_temp_moving_avg, 1) AS lag6_temp_moving_avg
, ROUND(lag6_prcp_moving_avg, 1) AS lag6_prcp_moving_avg
, ROUND(lag6_wdsp_moving_avg, 1) AS lag6_wdsp_moving_avg
, ROUND(lag6_temp_moving_avg - lag7_temp_moving_avg, 1) AS diff7_temp_moving_avg
, ROUND(lag6_prcp_moving_avg - lag7_prcp_moving_avg, 1) AS diff7_prcp_moving_avg
, ROUND(lag6_wdsp_moving_avg - lag7_wdsp_moving_avg, 1) AS diff7_wdsp_moving_avg
, ROUND(lag7_temp_moving_avg, 1) AS lag7_temp_moving_avg
, ROUND(lag7_prcp_moving_avg, 1) AS lag7_prcp_moving_avg
, ROUND(lag7_wdsp_moving_avg, 1) AS lag7_wdsp_moving_avg
, ROUND(lag7_temp_moving_avg - lag8_temp_moving_avg, 1) AS diff8_temp_moving_avg
, ROUND(lag7_prcp_moving_avg - lag8_prcp_moving_avg, 1) AS diff8_prcp_moving_avg
, ROUND(lag7_wdsp_moving_avg - lag8_wdsp_moving_avg, 1) AS diff8_wdsp_moving_avg
, ROUND(lag8_temp_moving_avg, 1) AS lag8_temp_moving_avg
, ROUND(lag8_prcp_moving_avg, 1) AS lag8_prcp_moving_avg
, ROUND(lag8_wdsp_moving_avg, 1) AS lag8_wdsp_moving_avg
FROM lag_moving_avg
WHERE
lag8_temp_moving_avg IS NOT NULL
ORDER BY observation_date;
-- all result rounded to 1 decimal place
WITH hurricane_geometry AS (
SELECT
* EXCEPT (longitude, latitude),
ST_GEOGPOINT(longitude, latitude) AS geom,
MAX(usa_wind) OVER (PARTITION BY sid) AS max_wnd_speed
FROM
`bigquery-public-data.noaa_hurricanes.hurricanes`
WHERE
season = '2020'
AND basin = 'NA'
AND name != 'NOT NAMED'
),
dist_between_points AS (
SELECT
sid,
name,
season,
iso_time,
max_wnd_speed,
geom,
ST_DISTANCE(geom, LAG(geom, 1) OVER (PARTITION BY sid ORDER BY iso_time ASC)) / 1000 AS dist
FROM
hurricane_geometry
),
total_distances AS (
SELECT
sid,
name,
season,
iso_time,
max_wnd_speed,
geom,
SUM(dist) OVER (PARTITION BY sid ORDER BY iso_time ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_distance,
SUM(dist) OVER (PARTITION BY sid) AS total_dist
FROM
dist_between_points
),
ranked_hurricanes AS (
SELECT
*,
DENSE_RANK() OVER (ORDER BY total_dist DESC) AS dense_rank
FROM
total_distances
)
SELECT
ST_Y(geom)
FROM
ranked_hurricanes
WHERE
dense_rank = 2
ORDER BY
cumulative_distance
DESC
LIMIT 1
;
WITH params AS (
SELECT ST_GeogPoint(-87.6847, 41.8319) AS center,
50 AS maxdist_km
),
distance_from_center AS (
SELECT
id,
name,
state,
ST_GeogPoint(longitude, latitude) AS loc,
ST_Distance(ST_GeogPoint(longitude, latitude), params.center) AS dist_meters
FROM
`bigquery-public-data.ghcn_d.ghcnd_stations`,
params
WHERE ST_DWithin(ST_GeogPoint(longitude, latitude), params.center, params.maxdist_km*1000)
),
nearest_stations AS (
SELECT
*,
RANK() OVER (ORDER BY dist_meters ASC) AS rank
FROM
distance_from_center
),
nearest_nstations AS (
SELECT
station.*
FROM
nearest_stations AS station, params
)
SELECT * from nearest_nstations
SELECT
bike_number,
AVG(dist_in_m) AS avg_dist_m,
SUM(dist_in_m) AS total_dist_m
FROM (
SELECT
ST_DISTANCE(
ST_GEOGPOINT(start_lon, start_lat),
ST_GEOGPOINT(end_lon, end_lat)
) AS dist_in_m,
starts.bike_number
FROM (
SELECT
latitude AS start_lat,
longitude AS start_lon,
bike_number,
trip_id
FROM `bigquery-public-data.san_francisco.bikeshare_trips` trips
LEFT JOIN `bigquery-public-data.san_francisco.bikeshare_stations` stations
ON trips.start_station_id = stations.station_id
) starts
LEFT JOIN (
SELECT
latitude AS end_lat,
longitude AS end_lon,
bike_number,
trip_id
FROM `bigquery-public-data.san_francisco.bikeshare_trips` trips
LEFT JOIN `bigquery-public-data.san_francisco.bikeshare_stations` stations
ON trips.end_station_id = stations.station_id
) ends ON ends.trip_id = starts.trip_id
)
GROUP BY bike_number
ORDER BY total_dist_m DESC
SELECT
tz.zone_name AS pickup_zone,
tz1.zone_name AS dropoff_zone,
time_duration_in_secs,
driving_speed_miles_per_hour,
tip_rate
FROM
(
SELECT *,
TIMESTAMP_DIFF(dropoff_datetime,pickup_datetime,SECOND) as time_duration_in_secs,
ROUND(trip_distance / (TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, SECOND) / 3600), 2) AS driving_speed_miles_per_hour,
(CASE WHEN total_amount=0 THEN 0
ELSE (tip_amount*100/total_amount) END) as tip_rate
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2016`
) t
INNER JOIN `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` tz
ON t.pickup_location_id = tz.zone_id
INNER JOIN `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` tz1
ON t.dropoff_location_id = tz1.zone_id
WHERE
pickup_datetime BETWEEN '2016-07-01' AND '2016-07-07'
AND dropoff_datetime BETWEEN '2016-07-01' AND '2016-07-07'
AND TIMESTAMP_DIFF(dropoff_datetime,pickup_datetime,SECOND) > 0
AND passenger_count > 5
AND trip_distance >= 10
AND tip_amount >= 0
AND tolls_amount >= 0
AND mta_tax >= 0
AND fare_amount >= 0
AND total_amount >= 0
ORDER BY total_amount DESC
LIMIT 10;
SELECT
-- Create a timestamp from the date components.
TIMESTAMP(CONCAT(year,"-",mo,"-",da)) AS timestamp,
-- Replace numerical null values with actual null
AVG(IF (temp=9999.9,
null,
temp)) AS temperature,
AVG(IF (wdsp="999.9",
null,
CAST(wdsp AS Float64))) AS wind_speed,
AVG(IF (prcp=99.99,
0,
prcp)) AS precipitation
FROM
`bigquery-public-data.noaa_gsod.gsod20*`
WHERE
CAST(YEAR AS INT64) > 2010
AND CAST(YEAR AS INT64) < 2021
AND CAST(MO AS INT64) = 6
AND CAST(DA AS INT64) = 12
AND stn = "725030" -- La Guardia
GROUP BY
timestamp
ORDER BY
timestamp ASC;
WITH WashingtonStations2023 AS
(
SELECT
weather.stn AS station_id,
ANY_VALUE(station.name) AS name
FROM
`bigquery-public-data.noaa_gsod.stations` AS station
INNER JOIN
`bigquery-public-data.noaa_gsod.gsod2023` AS weather
ON
station.usaf = weather.stn
WHERE
station.state = 'WA'
AND
station.usaf != '999999'
GROUP BY
station_id
),
prcp2023 AS (
SELECT
washington_stations.name,
(
SELECT
COUNT(*)
FROM
`bigquery-public-data.noaa_gsod.gsod2023` AS weather
WHERE
washington_stations.station_id = weather.stn
AND
prcp > 0
AND
prcp !=99.99
)
AS rainy_days
FROM
WashingtonStations2023 AS washington_stations
ORDER BY
rainy_days DESC
),
WashingtonStations2022 AS
(
SELECT
weather.stn AS station_id,
ANY_VALUE(station.name) AS name
FROM
`bigquery-public-data.noaa_gsod.stations` AS station
INNER JOIN
`bigquery-public-data.noaa_gsod.gsod2022` AS weather
ON
station.usaf = weather.stn
WHERE
station.state = 'WA'
AND
station.usaf != '999999'
GROUP BY
station_id
),
prcp2022 AS (
SELECT
washington_stations.name,
(
SELECT
COUNT(*)
FROM
`bigquery-public-data.noaa_gsod.gsod2022` AS weather
WHERE
washington_stations.station_id = weather.stn
AND
prcp > 0
AND
prcp != 99.99
)
AS rainy_days
FROM
WashingtonStations2022 AS washington_stations
ORDER BY
rainy_days DESC
)
SELECT prcp2023.name
FROM prcp2023
JOIN prcp2022
on prcp2023.name = prcp2022.name
WHERE prcp2023.rainy_days > 150
AND prcp2023.rainy_days < prcp2022.rainy_days
WITH DUBUQUE_LIQUOR_CTE AS (
SELECT
CASE
WHEN UPPER(category_name) LIKE 'BUTTERSCOTCH SCHNAPPS' THEN 'All Other' --Edge case is not a scotch
WHEN UPPER(category_name) LIKE '%WHISKIES'
AND UPPER(category_name) NOT LIKE '%RYE%'
AND UPPER(category_name) NOT LIKE '%BOURBON%'
AND UPPER(category_name) NOT LIKE '%SCOTCH%' THEN 'Other Whiskey'
WHEN UPPER(category_name) LIKE '%RYE%' THEN 'Rye Whiskey'
WHEN UPPER(category_name) LIKE '%BOURBON%' THEN 'Bourbon Whiskey'
WHEN UPPER(category_name) LIKE '%SCOTCH%' THEN 'Scotch Whiskey'
ELSE 'All Other'
END AS category_group,
EXTRACT(MONTH FROM date) AS month, -- At the time of this query, there is only data until month 6.
LEFT(CAST(zip_code AS string),5) AS zip_code, -- Casting to string necessary because zip_code has a mix of int & str types.
ROUND(SUM(sale_dollars), 2) AS sale_dollars_sum,
FROM
bigquery-public-data.iowa_liquor_sales.sales
WHERE
UPPER(county) = 'DUBUQUE'
AND EXTRACT(YEAR FROM date) = 2022
GROUP BY
category_group,
month,
zip_code
ORDER BY
category_group,
month,
zip_code
),
DUBUQUE_POPULATION_CTE AS (
SELECT
zipcode,
SUM(population) AS population_sum
FROM bigquery-public-data.census_bureau_usa.population_by_zip_2010
WHERE
minimum_age >= 21
GROUP BY
zipcode
),
MONTH_INFO AS (
SELECT
l.month,
l.zip_code,
l.sale_dollars_sum,
ROUND(sale_dollars_sum/p.population_sum, 2) AS dollars_per_capita
FROM
DUBUQUE_LIQUOR_CTE AS l
LEFT JOIN
DUBUQUE_POPULATION_CTE AS p
ON l.zip_code = p.zipcode
WHERE
category_group = 'Bourbon Whiskey'
GROUP BY
category_group,
zip_code,
month,
sale_dollars_sum,
zipcode,
population_sum
ORDER BY
zip_code,
month
),
zip_code_sales AS (
SELECT
zip_code,
SUM(sale_dollars_sum) AS total_sale_dollars_sum
FROM MONTH_INFO
GROUP BY zip_code
),
ranked_zip_codes AS (
SELECT
zip_code,
total_sale_dollars_sum,
ROW_NUMBER() OVER (ORDER BY total_sale_dollars_sum DESC) AS rank
FROM zip_code_sales
)
SELECT
t.month,
t.zip_code,
t.dollars_per_capita
FROM MONTH_INFO t
JOIN ranked_zip_codes r
ON t.zip_code = r.zip_code
WHERE r.rank = 3
ORDER BY t.month;
SELECT
c.fall_color,
SUM(d.count_growth) AS change
FROM (
SELECT
fall_color,
UPPER(species_scientific_name) AS latin
FROM
`bigquery-public-data.new_york.tree_species`)c
JOIN (
SELECT
IFNULL(a.upper_latin,
b.upper_latin) AS latin,
(IFNULL(count_2015,
0)-IFNULL(count_1995,
0)) AS count_growth
FROM (
SELECT
UPPER(spc_latin) AS upper_latin,
spc_common,
COUNT(*) AS count_2015
FROM
`bigquery-public-data.new_york.tree_census_2015`
WHERE
status="Alive"
GROUP BY
spc_latin,
spc_common)a
FULL OUTER JOIN (
SELECT
UPPER(spc_latin) AS upper_latin,
COUNT(*) AS count_1995
FROM
`bigquery-public-data.new_york.tree_census_1995`
WHERE
status !="Dead"
GROUP BY
spc_latin)b
ON
a.upper_latin=b.upper_latin
ORDER BY
count_growth DESC)d
ON
d.latin=c.latin
GROUP BY
fall_color
ORDER BY
change DESC
WITH stations AS (
SELECT station_id
FROM
`bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info` AS stainfo
WHERE stainfo.region_id = (
SELECT region.region_id
FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_regions` AS region
WHERE region.name = "Berkeley"
)
),
meta_data AS (
SELECT
round(st_distance(start_station_geom, end_station_geom), 1) as distancia_metros,
round(st_distance(start_station_geom, end_station_geom) / duration_sec, 1) as velocidade_media
FROM
`bigquery-public-data.san_francisco_bikeshare.bikeshare_trips` AS trips
WHERE
cast(trips.start_station_id as string) IN (SELECT station_id FROM stations)
AND cast(trips.end_station_id as string) IN (SELECT station_id FROM stations)
AND start_station_latitude IS NOT NULL
AND start_station_longitude IS NOT NULL
AND end_station_latitude IS NOT NULL
AND end_station_longitude IS NOT NULL
AND st_distance(start_station_geom, end_station_geom) > 1000
ORDER BY
velocidade_media DESC
LIMIT 1
)
SELECT velocidade_media as max_velocity
FROM meta_data;
WITH results AS (
SELECT
growth.country_name,
growth.net_migration,
CAST(area.country_area as INT64) as country_area
FROM (
SELECT
country_name,
net_migration,
country_code
FROM
`bigquery-public-data.census_bureau_international.birth_death_growth_rates`
WHERE
year = 2017
) growth
INNER JOIN (
SELECT
country_area,
country_code
FROM
`bigquery-public-data.census_bureau_international.country_names_area`
WHERE
country_area > 500
) area
ON
growth.country_code = area.country_code
ORDER BY
net_migration DESC
LIMIT 3
)
SELECT country_name, net_migration
FROM results;
WITH acs_2018 AS (
SELECT
geo_id,
median_income AS median_income_2018
FROM
`bigquery-public-data.census_bureau_acs.censustract_2018_5yr`
),
acs_2015 AS (
SELECT
geo_id,
median_income AS median_income_2015
FROM
`bigquery-public-data.census_bureau_acs.censustract_2015_5yr` ),
acs_diff AS (
SELECT
a18.geo_id,
a18.median_income_2018,
a15.median_income_2015,
(a18.median_income_2018 - a15.median_income_2015) AS median_income_diff,
FROM
acs_2018 a18
JOIN
acs_2015 a15
ON
a18.geo_id = a15.geo_id
),
max_geo_id AS (
SELECT
geo_id
FROM
acs_diff
WHERE
median_income_diff IS NOT NULL
AND acs_diff.geo_id in (
SELECT
geo_id
FROM
`bigquery-public-data.geo_census_tracts.census_tracts_california`
)
ORDER BY
median_income_diff DESC
LIMIT 1
)
SELECT
tracts.tract_ce as tract_code
FROM
max_geo_id
JOIN
`bigquery-public-data.geo_census_tracts.census_tracts_california` AS tracts
ON
max_geo_id.geo_id = tracts.geo_id;
WITH all_zip_tract_join AS (
SELECT
zips.zip_code,
zips.functional_status as zip_functional_status,
tracts.tract_ce,
tracts.geo_id as tract_geo_id,
tracts.functional_status as tract_functional_status,
ST_Area(ST_Intersection(tracts.tract_geom, zips.zip_code_geom))
/ ST_Area(tracts.tract_geom) as tract_pct_in_zip_code
FROM
`bigquery-public-data.geo_census_tracts.us_census_tracts_national` tracts,
`bigquery-public-data.geo_us_boundaries.zip_codes` zips
WHERE
ST_Intersects(tracts.tract_geom, zips.zip_code_geom)
),
zip_tract_join AS (
SELECT * FROM all_zip_tract_join WHERE tract_pct_in_zip_code > 0
),
census_totals AS (
-- convert averages to additive totals
SELECT
geo_id,
total_pop,
total_pop * income_per_capita AS total_income
FROM
`bigquery-public-data.census_bureau_acs.censustract_2017_5yr`
),
joined AS (
-- join with precomputed census/zip pairs,
-- compute zip's share of tract
SELECT
zip_code,
total_pop * tract_pct_in_zip_code AS zip_pop,
total_income * tract_pct_in_zip_code AS zip_income
FROM census_totals c
JOIN zip_tract_join ztj
ON c.geo_id = ztj.tract_geo_id
),
sums AS (
-- aggregate all "pieces" of zip code
SELECT
zip_code,
SUM(zip_pop) AS zip_pop,
SUM(zip_income) AS zip_total_inc
FROM joined
GROUP BY zip_code
),
zip_pop_income AS (
SELECT
zip_code, zip_pop,
-- convert to averages
zip_total_inc / zip_pop AS income_per_capita
FROM sums
),
zipcodes_within_distance as (
SELECT
zip_code, zip_code_geom
FROM
`bigquery-public-data.geo_us_boundaries.zip_codes`
WHERE
state_code = 'WA' -- Washington state code
AND
ST_DWithin(
ST_GeogPoint(-122.191667, 47.685833),
zip_code_geom,
8046.72
)
)
select
stats.zip_code,
ROUND(stats.zip_pop, 1) as zip_population,
ROUND(stats.income_per_capita, 1) as average_income
from
zipcodes_within_distance area
join
zip_pop_income stats
on area.zip_code = stats.zip_code
ORDER BY
average_income DESC;
WITH poverty_and_natality AS (
SELECT
EXTRACT(YEAR FROM n.Year) AS data_year,
p.geo_id AS county_fips,
(p.poverty / p.pop_determined_poverty_status) * 100 AS poverty_rate,
SUM(n.Births) AS total_births,
SUM(CASE WHEN n.Maternal_Morbidity_YN = 0 THEN n.Births ELSE 0 END) AS births_without_morbidity
FROM
`bigquery-public-data.census_bureau_acs.county_2015_5yr` p
JOIN
`bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_maternal_morbidity` n
ON p.geo_id = n.County_of_Residence_FIPS
WHERE
p.pop_determined_poverty_status > 0 AND
EXTRACT(YEAR FROM n.Year) = 2016
GROUP BY
p.geo_id, p.poverty, p.pop_determined_poverty_status, EXTRACT(YEAR FROM n.Year)
UNION ALL
SELECT
EXTRACT(YEAR FROM n.Year) AS data_year,
p.geo_id AS county_fips,
(p.poverty / p.pop_determined_poverty_status) * 100 AS poverty_rate,
SUM(n.Births) AS total_births,
SUM(CASE WHEN n.Maternal_Morbidity_YN = 0 THEN n.Births ELSE 0 END) AS births_without_morbidity
FROM
`bigquery-public-data.census_bureau_acs.county_2016_5yr` p
JOIN
`bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_maternal_morbidity` n
ON p.geo_id = n.County_of_Residence_FIPS
WHERE
p.pop_determined_poverty_status > 0 AND
EXTRACT(YEAR FROM n.Year) = 2017
GROUP BY
p.geo_id, p.poverty, p.pop_determined_poverty_status, EXTRACT(YEAR FROM n.Year)
UNION ALL
SELECT
EXTRACT(YEAR FROM n.Year) AS data_year,
p.geo_id AS county_fips,
(p.poverty / p.pop_determined_poverty_status) * 100 AS poverty_rate,
SUM(n.Births) AS total_births,
SUM(CASE WHEN n.Maternal_Morbidity_YN = 0 THEN n.Births ELSE 0 END) AS births_without_morbidity
FROM
`bigquery-public-data.census_bureau_acs.county_2017_5yr` p
JOIN
`bigquery-public-data.sdoh_cdc_wonder_natality.county_natality_by_maternal_morbidity` n
ON p.geo_id = n.County_of_Residence_FIPS
WHERE
p.pop_determined_poverty_status > 0 AND
EXTRACT(YEAR FROM n.Year) = 2018
GROUP BY
p.geo_id, p.poverty, p.pop_determined_poverty_status, EXTRACT(YEAR FROM n.Year)
)
SELECT
data_year,
CORR(poverty_rate, (births_without_morbidity / total_births) * 100) AS correlation_coefficient
FROM
poverty_and_natality
GROUP BY
data_year
WITH acs_2018 AS (
SELECT geo_id, unemployed_pop AS unemployed_2018
FROM `bigquery-public-data.census_bureau_acs.county_2018_5yr`
),
acs_2015 AS (
SELECT geo_id, unemployed_pop AS unemployed_2015
FROM `bigquery-public-data.census_bureau_acs.county_2015_5yr`
),
unemployed_change AS (
SELECT
u18.unemployed_2018, u18.geo_id, u15.unemployed_2015,
(u18.unemployed_2018 - u15.unemployed_2015) AS u_change
FROM acs_2018 u18
JOIN acs_2015 u15
ON u18.geo_id = u15.geo_id
),
duals_Jan_2018 AS (
SELECT Public_Total AS duals_2018, County_Name, FIPS
FROM `bigquery-public-data.sdoh_cms_dual_eligible_enrollment.dual_eligible_enrollment_by_county_and_program`
WHERE Date = '2018-12-01'
),
duals_Jan_2015 AS (
SELECT Public_Total AS duals_2015, County_Name, FIPS
FROM `bigquery-public-data.sdoh_cms_dual_eligible_enrollment.dual_eligible_enrollment_by_county_and_program`
WHERE Date = '2015-12-01'
),
duals_change AS (
SELECT
d18.FIPS, d18.County_Name, d18.duals_2018, d15.duals_2015,
(d18.duals_2018 - d15.duals_2015) AS total_duals_diff
FROM duals_Jan_2018 d18
JOIN duals_Jan_2015 d15
ON d18.FIPS = d15.FIPS
),
corr_tbl AS (
SELECT unemployed_change.geo_id, duals_change.County_Name, unemployed_change.u_change, duals_change.total_duals_diff
FROM unemployed_change
JOIN duals_change
ON unemployed_change.geo_id = duals_change.FIPS
)
SELECT COUNT(*)
FROM corr_tbl
WHERE
u_change >0
AND
corr_tbl.total_duals_diff < 0
SELECT
incidents AS highest_monthly_thefts
FROM (
SELECT
year,
EXTRACT(MONTH FROM date) AS month,
COUNT(1) AS incidents,
RANK() OVER (PARTITION BY year ORDER BY COUNT(1) DESC) AS ranking
FROM
`bigquery-public-data.chicago_crime.crime`
WHERE
primary_type = 'MOTOR VEHICLE THEFT'
AND year = 2016
GROUP BY
year,
month
)
WHERE
ranking = 1
ORDER BY
year DESC
LIMIT 1;
SELECT
year,
incidents
FROM (
SELECT
year,
EXTRACT(MONTH
FROM
date) AS month,
COUNT(1) AS incidents,
RANK() OVER (PARTITION BY year ORDER BY COUNT(1) DESC) AS ranking
FROM
`bigquery-public-data.chicago_crime.crime`
WHERE
primary_type = 'MOTOR VEHICLE THEFT'
AND year BETWEEN 2010 AND 2016
GROUP BY
year,
month )
WHERE
ranking = 1
ORDER BY
year ASC
SELECT
T1.targetId AS target_id,
T1.datasourceId,
targets.approvedSymbol AS approved_symbol,
overall_associations.score AS overall_score
FROM
`bigquery-public-data.open_targets_platform.associationByDatasourceDirect` as T1
JOIN
`bigquery-public-data.open_targets_platform.targets` AS targets
ON
targetId = targets.id
JOIN
`bigquery-public-data.open_targets_platform.associationByOverallDirect` AS overall_associations
ON
T1.targetId = overall_associations.targetId
WHERE
overall_associations.diseaseId = 'EFO_0000676' AND datasourceId = 'impc'
ORDER BY
overall_associations.score DESC
LIMIT
1;
SELECT t1.*
FROM
(SELECT Trips.trip_id TripId,
Trips.duration_sec TripDuration,
Trips.start_date TripStartDate,
Trips.start_station_name TripStartStation,
Trips.member_gender Gender,
Regions.name RegionName
FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips` Trips
INNER JOIN `bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info` StationInfo
ON CAST(Trips.start_station_id AS STRING) = CAST(StationInfo.station_id AS STRING)
INNER JOIN `bigquery-public-data.san_francisco_bikeshare.bikeshare_regions` Regions
ON StationInfo.region_id = Regions.region_id
WHERE (EXTRACT(YEAR from Trips.start_date)) BETWEEN 2014 AND 2017
)
t1
RIGHT JOIN (SELECT MAX(start_date) TripStartDate,
Regions.name RegionName
FROM `bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info` StationInfo
INNER JOIN `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips` Trips
ON CAST(StationInfo.station_id AS STRING) = CAST(Trips.start_station_id AS STRING)
INNER JOIN `bigquery-public-data.san_francisco_bikeshare.bikeshare_regions` Regions
ON Regions.region_id = StationInfo.region_id
WHERE (EXTRACT(YEAR from Trips.start_date) BETWEEN 2014 AND 2017
AND Regions.name IS NOT NULL)
GROUP BY RegionName)
t2
ON t1.RegionName = t2.RegionName AND t1.TripStartDate = t2.TripStartDate
SELECT
c.country,
c.total_confirmed_cases,
(c.total_confirmed_cases / p.population) * 100000 AS cases_per_100k
FROM
(
SELECT
CASE
WHEN country_region = 'US' THEN 'United States'
WHEN country_region = 'Iran' THEN 'Iran, Islamic Rep.'
ELSE country_region
END AS country,
SUM(confirmed) AS total_confirmed_cases
FROM
`bigquery-public-data.covid19_jhu_csse.summary`
WHERE
date = '2020-04-20'
AND country_region IN ('US', 'France', 'China', 'Italy', 'Spain', 'Germany', 'Iran')
GROUP BY
country
) AS c
JOIN
(
SELECT
country_name AS country,
SUM(value) AS population
FROM
`bigquery-public-data.world_bank_wdi.indicators_data`
WHERE
indicator_code = 'SP.POP.TOTL'
AND year = 2020
GROUP BY
country_name
) AS p
ON
c.country = p.country
ORDER BY
cases_per_100k DESC
WITH
country_pop AS (
SELECT
country_code AS iso_3166_1_alpha_3,
year_2018 AS population_2018
FROM
`bigquery-public-data.world_bank_global_population.population_by_country`)
SELECT
country_code,
country_name,
cumulative_confirmed AS june_confirmed_cases,
population_2018,
ROUND(cumulative_confirmed/population_2018 * 100,2) AS case_percent
FROM
`bigquery-public-data.covid19_open_data.covid19_open_data`
JOIN
country_pop
USING
(iso_3166_1_alpha_3)
WHERE
date = '2020-06-30'
AND aggregation_level = 0
ORDER BY
case_percent DESC
SELECT
table_2019.avg_symptom_Anosmia_2019,
table_2020.avg_symptom_Anosmia_2020,
((table_2020.avg_symptom_Anosmia_2020 - table_2019.avg_symptom_Anosmia_2019) / table_2019.avg_symptom_Anosmia_2019) * 100 AS avg_increase
FROM (
SELECT
AVG(SAFE_CAST(symptom_Anosmia AS FLOAT64)) AS avg_symptom_Anosmia_2020
FROM
`bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly`
WHERE
sub_region_1 = "New York"
AND sub_region_2 IN ("Bronx County", "Queens County", "Kings County", "New York County", "Richmond County")
AND date >= '2020-01-01'
AND date < '2021-01-01'
) AS table_2020,
(
SELECT
AVG(SAFE_CAST(symptom_Anosmia AS FLOAT64)) AS avg_symptom_Anosmia_2019
FROM
`bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly`
WHERE
sub_region_1 = "New York"
AND sub_region_2 IN ("Bronx County", "Queens County", "Kings County", "New York County", "Richmond County")
AND date >= '2019-01-01'
AND date < '2020-01-01'
) AS table_2019
SELECT
table_2019.avg_symptom_Anxiety_2019,
table_2020.avg_symptom_Anxiety_2020,
((table_2020.avg_symptom_Anxiety_2020 - table_2019.avg_symptom_Anxiety_2019)/table_2019.avg_symptom_Anxiety_2019) * 100 AS percent_increase_anxiety,
table_2019.avg_symptom_Depression_2019,
table_2020.avg_symptom_Depression_2020,
((table_2020.avg_symptom_Depression_2020 - table_2019.avg_symptom_Depression_2019)/table_2019.avg_symptom_Depression_2019) * 100 AS percent_increase_depression
FROM (
SELECT
AVG(CAST(symptom_Anxiety AS FLOAT64)) AS avg_symptom_Anxiety_2020,
AVG(CAST(symptom_Depression AS FLOAT64)) AS avg_symptom_Depression_2020,
FROM
`bigquery-public-data.covid19_symptom_search.symptom_search_country_weekly`
WHERE
country_region_code = "US"
AND date >= '2020-01-01'
AND date <'2021-01-01') AS table_2020,
(
SELECT
AVG(CAST(symptom_Anxiety AS FLOAT64)) AS avg_symptom_Anxiety_2019,
AVG(CAST(symptom_Depression AS FLOAT64)) AS avg_symptom_Depression_2019,
FROM
`bigquery-public-data.covid19_symptom_search.symptom_search_country_weekly`
WHERE
country_region_code = "US"
AND date >= '2019-01-01'
AND date <'2020-01-01') AS table_2019
WITH
num_vaccine_sites_per_county AS (
SELECT
facility_sub_region_1 AS us_state,
facility_sub_region_2 AS us_county,
facility_sub_region_2_code AS us_county_fips,
COUNT(DISTINCT facility_place_id) AS num_vaccine_sites
FROM
bigquery-public-data.covid19_vaccination_access.facility_boundary_us_all
WHERE
STARTS_WITH(facility_sub_region_2_code, "06")
GROUP BY
facility_sub_region_1,
facility_sub_region_2,
facility_sub_region_2_code ),
total_population_per_county AS (
SELECT
LEFT(geo_id, 5) AS us_county_fips,
ROUND(SUM(total_pop)) AS total_population
FROM
bigquery-public-data.census_bureau_acs.censustract_2018_5yr
WHERE
STARTS_WITH(LEFT(geo_id, 5), "06")
GROUP BY
LEFT(geo_id, 5) )
SELECT
* EXCEPT(us_county_fips),
ROUND((num_vaccine_sites * 1000) / total_population, 2) AS sites_per_1k_ppl
FROM
num_vaccine_sites_per_county
INNER JOIN
total_population_per_county
USING
(us_county_fips)
ORDER BY
sites_per_1k_ppl ASC
LIMIT
100;
WITH MomentumTrades AS (
SELECT
StrikePrice - LastPx AS priceDifference
FROM
`bigquery-public-data.cymbal_investments.trade_capture_report`
WHERE
SUBSTR(TargetCompID, 0, 4) = 'MOMO'
AND (SELECT Side FROM UNNEST(Sides)) = 'LONG'
),
FeelingLuckyTrades AS (
SELECT
StrikePrice - LastPx AS priceDifference
FROM
`bigquery-public-data.cymbal_investments.trade_capture_report`
WHERE
SUBSTR(TargetCompID, 0, 4) = 'LUCK'
AND (SELECT Side FROM UNNEST(Sides)) = 'LONG'
)
SELECT
AVG(FeelingLuckyTrades.priceDifference) - AVG(MomentumTrades.priceDifference) AS averageDifference
FROM
MomentumTrades,
FeelingLuckyTrades
SELECT
targets.approvedSymbol AS target_symbol,
drugs.name AS drug_name,
source_urls.element.url AS clinical_trial_reference_url,
FROM
`open-targets-prod.platform.evidence` AS evidence,
UNNEST(evidence.urls.list) AS source_urls
JOIN
`open-targets-prod.platform.targets` AS targets
ON
evidence.targetId=targets.id
JOIN
`open-targets-prod.platform.molecule` AS drugs
ON
evidence.drugId=drugs.id
WHERE
datasourceId="chembl"
AND diseaseId="EFO_0007416"
AND evidence.clinicalStatus = "Completed"
WITH tenplus AS (
SELECT
year,
EXTRACT(DAYOFYEAR FROM DATE(eventdate)) AS dayofyear,
COUNT(*) AS count
FROM
bigquery-public-data.gbif.occurrences
WHERE
eventdate IS NOT NULL
AND species = 'Sterna paradisaea'
AND decimallatitude > 40.0
AND month > 1
GROUP BY
year,
eventdate
HAVING
COUNT(*) > 10
)
SELECT
year AS year
FROM
tenplus
GROUP BY
year
ORDER BY
MIN(dayofyear)
LIMIT 1;
WITH bea_2012 AS (
SELECT GeoFIPS, GeoName, Earnings_per_job_avg AS earnings_2012
FROM `bigquery-public-data.sdoh_bea_cainc30.fips`
WHERE Year='2012-01-01' AND ENDS_WITH(GeoName, "MA") IS TRUE
),
bea_2017 AS (
SELECT GeoFIPS, GeoName, Earnings_per_job_avg AS earnings_2017
FROM `bigquery-public-data.sdoh_bea_cainc30.fips`
WHERE Year='2017-01-01' AND ENDS_WITH(GeoName, "MA") IS TRUE
),
earnings_diff AS (
SELECT
bea_2017.GeoFIPS, bea_2017.GeoName, bea_2017.earnings_2017, bea_2012.earnings_2012,
(bea_2017.earnings_2017 - bea_2012.earnings_2012) AS earnings_change
FROM bea_2017
JOIN bea_2012
ON bea_2017.GeoFIPS = bea_2012.GeoFIPS
)
SELECT * FROM earnings_diff WHERE earnings_change IS NOT NULL ORDER BY earnings_change DESC
WITH t2 AS
(
SELECT
t.*,
t.pickup_location_id as pickup_zone_id,
tz.borough as pickup_borough
FROM
(
SELECT *,
TIMESTAMP_DIFF(dropoff_datetime,pickup_datetime,SECOND) as time_duration_in_secs,
(CASE WHEN total_amount=0 THEN 0
ELSE (tip_amount*100/total_amount) END) as tip_rate
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2016`
) t
INNER JOIN `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` tz
ON t.pickup_location_id = tz.zone_id
WHERE
pickup_datetime BETWEEN '2016-01-01' AND '2016-01-07'
AND dropoff_datetime BETWEEN '2016-01-01' AND '2016-01-07'
AND TIMESTAMP_DIFF(dropoff_datetime,pickup_datetime,SECOND) > 0
AND passenger_count > 0
AND trip_distance >= 0
AND tip_amount >= 0
AND tolls_amount >= 0
AND mta_tax >= 0
AND fare_amount >= 0
AND total_amount >= 0
),
t3 AS
(SELECT
pickup_borough,
(CASE
WHEN tip_rate = 0 THEN 'no tip'
WHEN tip_rate <= 5 THEN 'Less than 5%'
WHEN tip_rate <= 10 THEN '5% to 10%'
WHEN tip_rate <= 15 THEN '10% to 15%'
WHEN tip_rate <= 20 THEN '15% to 20%'
WHEN tip_rate <= 25 THEN '20% to 25%'
ELSE 'More than 25%' END)as tip_category,
COUNT(*) as no_of_trips
FROM t2
GROUP BY 1,2
ORDER BY pickup_borough ASC),
INFO AS (
SELECT pickup_borough
, tip_category
, Sum(no_of_trips) as no_of_trips,
(CASE
WHEN pickup_borough is null THEN (select sum(no_of_trips)
FROM t3)
WHEN pickup_borough is not null and tip_category is null THEN (select sum(no_of_trips)
FROM t3)
WHEN pickup_borough is not null and tip_category is not null THEN (select sum(no_of_trips)
FROM t3
WHERE pickup_borough = m.pickup_borough)
END) as parent_sum,
(
Sum(no_of_trips)
/
(
CASE
WHEN pickup_borough is null THEN (select sum(no_of_trips)
FROM t3)
WHEN pickup_borough is not null and tip_category is null THEN (select sum(no_of_trips)
FROM t3)
WHEN pickup_borough is not null and tip_category is not null THEN (select sum(no_of_trips)
FROM t3
WHERE pickup_borough = m.pickup_borough)
END
)
) as percentage
FROM t3 m
GROUP BY ROLLUP(pickup_borough, tip_category)
order by 1, 2
)
SELECT
pickup_borough,
(SUM(CASE WHEN tip_category = 'no tip' THEN no_of_trips ELSE 0 END) * 100.0 / SUM(no_of_trips)) AS percentage_no_tip
FROM t3
GROUP BY pickup_borough
ORDER BY pickup_borough;
WITH gene_region AS (
SELECT
MIN(start_position) AS start_pos,
MAX(end_position) AS end_pos
FROM `bigquery-public-data.gnomAD.v2_1_1_genomes__chr17` AS main_table
WHERE EXISTS (
SELECT 1
FROM UNNEST(main_table.alternate_bases) AS alternate_bases
WHERE EXISTS (
SELECT 1
FROM UNNEST(alternate_bases.vep) AS vep
WHERE vep.SYMBOL = 'BRCA1'
)
)
)
SELECT
DISTINCT start_position
FROM `bigquery-public-data.gnomAD.v2_1_1_genomes__chr17` AS main_table,
UNNEST(main_table.alternate_bases) AS alternate_bases,
UNNEST(alternate_bases.vep) AS vep,
gene_region
WHERE main_table.start_position >= gene_region.start_pos
AND main_table.start_position <= gene_region.end_pos
AND REGEXP_CONTAINS(vep.Consequence, r"missense_variant")
AND reference_bases = "C"
AND alternate_bases.alt = "T"
WITH summary_stats AS (
SELECT
COUNT(1) AS num_variants,
SUM((SELECT alt.AC FROM UNNEST(alternate_bases) AS alt)) AS sum_AC,
SUM(AN) AS sum_AN,
-- Also include some information from Variant Effect Predictor (VEP).
STRING_AGG(DISTINCT (SELECT annot.symbol FROM UNNEST(alternate_bases) AS alt,
UNNEST(vep) AS annot LIMIT 1), ', ') AS genes
FROM bigquery-public-data.gnomAD.v3_genomes__chr1 AS main_table
WHERE start_position >= 55039447 AND start_position <= 55064852
)
SELECT
ROUND((55064852 - 55039447) / num_variants, 3) AS burden_of_mutation,
*
FROM summary_stats;
SELECT * FROM
(
SELECT
'2015' AS year,
COUNT(a.consecutive_number) AS total,
a.state_name AS state,
c.state_pop AS population,
(COUNT(a.consecutive_number) / c.state_pop * 100000) AS rate_per_100000
FROM
`bigquery-public-data.nhtsa_traffic_fatalities.accident_2015` a
JOIN
`bigquery-public-data.nhtsa_traffic_fatalities.distract_2015` b
ON
a.consecutive_number = b.consecutive_number
JOIN (
SELECT
SUM(d.population) AS state_pop,
e.state_name AS state
FROM
`bigquery-public-data.census_bureau_usa.population_by_zip_2010` d
JOIN
`bigquery-public-data.utility_us.zipcode_area` e
ON
d.zipcode = e.zipcode
GROUP BY
state ) c
ON
c.state = a.state_name
WHERE
b.driver_distracted_by_name NOT IN ('Not Distracted', 'Unknown if Distracted', 'Not Reported')
GROUP BY
state,
population,
c.state_pop
ORDER BY
rate_per_100000 DESC
LIMIT 5
)
UNION ALL
(
SELECT
'2016' AS year,
COUNT(a.consecutive_number) AS total,
a.state_name AS state,
c.state_pop AS population,
(COUNT(a.consecutive_number) / c.state_pop * 100000) AS rate_per_100000
FROM
`bigquery-public-data.nhtsa_traffic_fatalities.accident_2016` a
JOIN
`bigquery-public-data.nhtsa_traffic_fatalities.distract_2016` b
ON
a.consecutive_number = b.consecutive_number
JOIN (
SELECT
SUM(d.population) AS state_pop,
e.state_name AS state
FROM
`bigquery-public-data.census_bureau_usa.population_by_zip_2010` d
JOIN
`bigquery-public-data.utility_us.zipcode_area` e
ON
d.zipcode = e.zipcode
GROUP BY
state ) c
ON
c.state = a.state_name
WHERE
b.driver_distracted_by_name NOT IN ('Not Distracted', 'Unknown if Distracted', 'Not Reported')
GROUP BY
state,
population,
c.state_pop
ORDER BY
rate_per_100000 DESC
LIMIT 5
)
WITH coloc_stats AS (
SELECT
coloc.coloc_log2_h4_h3,
coloc.right_study AS qtl_source
FROM
`open-targets-genetics.genetics.variant_disease_coloc` AS coloc
JOIN
`open-targets-genetics.genetics.studies` AS studies
ON
coloc.left_study = studies.study_id
WHERE
coloc.right_gene_id = "ENSG00000169174"
AND coloc.coloc_h4 > 0.8
AND coloc.coloc_h3 < 0.02
AND studies.trait_reported LIKE "%lesterol levels%"
AND coloc.right_bio_feature = 'IPSC'
AND CONCAT(coloc.left_chrom, '_', coloc.left_pos, '_', coloc.left_ref, '_', coloc.left_alt) = '1_55029009_C_T'
),
max_value AS (
SELECT
MAX(coloc_log2_h4_h3) AS max_log2_h4_h3
FROM
coloc_stats
)
SELECT
AVG(coloc_log2_h4_h3) AS average,
VAR_SAMP(coloc_log2_h4_h3) AS variance,
MAX(coloc_log2_h4_h3) - MIN(coloc_log2_h4_h3) AS max_min_difference,
(SELECT qtl_source FROM coloc_stats WHERE coloc_log2_h4_h3 = (SELECT max_log2_h4_h3 FROM max_value)) AS qtl_source_of_max
FROM
coloc_stats;
WITH homeless_2012 AS (
SELECT Homeless_Veterans AS Vet12, CoC_Name
FROM `bigquery-public-data.sdoh_hud_pit_homelessness.hud_pit_by_coc`
WHERE SUBSTR(CoC_Number,0,2) = "NY" AND Count_Year = 2012
),
homeless_2018 AS (
SELECT Homeless_Veterans AS Vet18, CoC_Name
FROM `bigquery-public-data.sdoh_hud_pit_homelessness.hud_pit_by_coc`
WHERE SUBSTR(CoC_Number,0,2) = "NY" AND Count_Year = 2018
),
veterans_change AS (
SELECT homeless_2012.COC_Name, Vet12, Vet18, Vet18 - Vet12 AS VetChange
FROM homeless_2018
JOIN homeless_2012
ON homeless_2018.CoC_Name = homeless_2012.CoC_Name
)
SELECT COC_Name, VetChange FROM veterans_change
ORDER BY CoC_Name;
WITH geo AS (
SELECT DISTINCT geo_id
FROM `bigquery-public-data.geo_us_boundaries.counties`
WHERE county_name = "Allegheny"
),
avg_wage_1998 AS(
SELECT
ROUND(AVG(avg_wkly_wage_10_total_all_industries) * 52, 2) AS wages_1998
FROM
`bigquery-public-data.bls_qcew.1998*`
WHERE
geoid = (SELECT geo_id FROM geo) --Selecting Allgeheny County
),
avg_wage_2017 AS (
SELECT
ROUND(AVG(avg_wkly_wage_10_total_all_industries) * 52, 2) AS wages_2017
FROM
`bigquery-public-data.bls_qcew.2017*`
WHERE
geoid = (SELECT geo_id FROM geo) --Selecting Allgeheny County
),
avg_cpi_1998 AS (
SELECT
AVG(value) AS cpi_1998
FROM
`bigquery-public-data.bls.cpi_u` c
WHERE
year = 1998
AND item_code in (
SELECT DISTINCT item_code FROM `bigquery-public-data.bls.cpi_u` WHERE LOWER(item_name) = "all items"
)
AND area_code = (
SELECT DISTINCT area_code FROM `bigquery-public-data.bls.cpi_u` WHERE area_name LIKE '%Pittsburgh%'
)
),
-- A104 is the code for Pittsburgh, PA
-- SA0 is the code for all items
avg_cpi_2017 AS(
SELECT
AVG(value) AS cpi_2017
FROM
`bigquery-public-data.bls.cpi_u` c
WHERE
year = 2017
AND item_code in (
SELECT DISTINCT item_code FROM `bigquery-public-data.bls.cpi_u` WHERE LOWER(item_name) = "all items"
)
AND area_code = (
SELECT DISTINCT area_code FROM `bigquery-public-data.bls.cpi_u` WHERE area_name LIKE '%Pittsburgh%'
)
)
-- A104 is the code for Pittsburgh, PA
-- SA0 is the code for all items
SELECT
ROUND((wages_2017 - wages_1998) / wages_1998 * 100, 2) AS wages_percent_change,
ROUND((cpi_2017 - cpi_1998) / cpi_1998 * 100, 2) AS cpi_percent_change
FROM
avg_wage_2017,
avg_wage_1998,
avg_cpi_2017,
avg_cpi_1998
WITH utah_code AS (
SELECT DISTINCT geo_id
FROM bigquery-public-data.geo_us_boundaries.states
WHERE state_name = 'Utah'
),
e2000 as(
SELECT
AVG(month3_emplvl_23_construction) AS construction_employees_2000,
geoid
FROM
`bigquery-public-data.bls_qcew.2000_*`
WHERE
geoid LIKE CONCAT((SELECT geo_id FROM utah_code), '%')
GROUP BY
geoid),
e2018 AS (
SELECT
AVG(month3_emplvl_23_construction) AS construction_employees_2018,
geoid,
FROM
`bigquery-public-data.bls_qcew.2018_*` e2018
WHERE
geoid LIKE CONCAT((SELECT geo_id FROM utah_code), '%')
GROUP BY
geoid)
SELECT
c.county_name AS county,
(construction_employees_2018 - construction_employees_2000) / construction_employees_2000 * 100 AS increase_rate
FROM
e2000
JOIN
e2018 USING (geoid)
JOIN
`bigquery-public-data.geo_us_boundaries.counties` c ON c.geo_id = e2018.geoid
WHERE
c.state_fips_code = (SELECT geo_id FROM utah_code)
ORDER BY
increase_rate desc
LIMIT 1
SELECT
aq.city,
epa.arithmetic_mean,
aq.value,
aq.timestamp,
(epa.arithmetic_mean - aq.value)
FROM
`bigquery-public-data.openaq.global_air_quality` AS aq
JOIN
`bigquery-public-data.epa_historical_air_quality.air_quality_annual_summary` AS epa
ON
ROUND(aq.latitude, 2) = ROUND(epa.latitude, 2)
AND ROUND(aq.longitude, 2) = ROUND(epa.longitude, 2)
WHERE
epa.units_of_measure = "Micrograms/cubic meter (LC)"
AND epa.parameter_name = "Acceptable PM2.5 AQI & Speciation Mass"
AND epa.year = 1990
AND aq.pollutant = "pm25"
AND EXTRACT(YEAR FROM aq.timestamp) = 2020
ORDER BY
(epa.arithmetic_mean - aq.value) DESC
LIMIT 3
SELECT
country_name
FROM
(SELECT
age.country_name,
SUM(age.population) AS under_25,
pop.midyear_population AS total,
ROUND((SUM(age.population) / pop.midyear_population) * 100,2) AS pct_under_25
FROM (
SELECT
country_name,
population,
country_code
FROM
`bigquery-public-data.census_bureau_international.midyear_population_agespecific`
WHERE
year =2017
AND age < 25) age
INNER JOIN (
SELECT
midyear_population,
country_code
FROM
`bigquery-public-data.census_bureau_international.midyear_population`
WHERE
year = 2017) pop
ON
age.country_code = pop.country_code
GROUP BY
1,
3
ORDER BY
4 DESC
)
LIMIT
1
WITH hurricane_geometry AS (
SELECT
* EXCEPT (longitude, latitude),
ST_GEOGPOINT(longitude, latitude) AS geom,
FROM
`bigquery-public-data.noaa_hurricanes.hurricanes`
WHERE
season = '2020'
AND basin = 'NA'
AND name != 'NOT NAMED'
),
dist_between_points AS (
SELECT
sid,
name,
season,
iso_time,
usa_wind,
geom,
ST_DISTANCE(geom, LAG(geom, 1) OVER (PARTITION BY sid ORDER BY iso_time ASC)) / 1000 AS dist
FROM
hurricane_geometry
),
total_distances AS (
SELECT
sid,
name,
season,
iso_time,
usa_wind,
geom,
SUM(dist) OVER (PARTITION BY sid ORDER BY iso_time ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_distance,
SUM(dist) OVER (PARTITION BY sid) AS total_dist
FROM
dist_between_points
),
ranked_hurricanes AS (
SELECT
*,
DENSE_RANK() OVER (ORDER BY total_dist DESC) AS dense_rank
FROM
total_distances
)
SELECT
geom,cumulative_distance,usa_wind
FROM
ranked_hurricanes
WHERE
dense_rank = 3
ORDER BY
cumulative_distance;
WITH acs_2017 AS (
SELECT geo_id, income_less_10000 AS i10, income_10000_14999 AS i15, income_15000_19999 AS i20
FROM `bigquery-public-data.census_bureau_acs.county_2017_5yr`
),
snap_2017_Jan AS (
SELECT FIPS, SNAP_All_Participation_Households AS snap_total
FROM `bigquery-public-data.sdoh_snap_enrollment.snap_enrollment`
WHERE Date = '2017-01-01'
)
SELECT acs_2017.geo_id, snap_2017_Jan.snap_total,
(acs_2017.i10 + acs_2017.i15 + acs_2017.i20) As households_under_20,
(acs_2017.i10 + acs_2017.i15 + acs_2017.i20)/snap_2017_Jan.snap_total As under_20_snap_ratio
FROM acs_2017
JOIN snap_2017_Jan
ON acs_2017.geo_id = snap_2017_Jan.FIPS
WHERE snap_2017_Jan.snap_total > 0
ORDER BY snap_2017_Jan.snap_total DESC
LIMIT 10
WITH first_answers AS (
SELECT
parent_id AS question_id,
MIN(creation_date) AS first_answer_date
FROM
`bigquery-public-data.stackoverflow.posts_answers`
GROUP BY
parent_id
)
SELECT
FORMAT_DATE('%A', DATE(q.creation_date)) AS question_day,
SUM(CASE WHEN f.first_answer_date IS NOT NULL AND TIMESTAMP_DIFF(f.first_answer_date, q.creation_date, MINUTE) <= 60 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS percent_questions
FROM
`bigquery-public-data.stackoverflow.posts_questions` q
LEFT JOIN
first_answers f
ON
q.id = f.question_id
GROUP BY
question_day
ORDER BY
percent_questions DESC
LIMIT 1 OFFSET 2
With INFO AS (
SELECT
MR.patientId,
P.last_name,
ARRAY_TO_STRING(P.first_name, " ") AS First_name,
Condition.Codes,
Condition.Conditions,
MR.med_count AS COUNT_NUMBER
FROM
(SELECT
id,
name[safe_offset(0)].family as last_name,
name[safe_offset(0)].given as first_name,
TIMESTAMP(deceased.dateTime) AS deceased_datetime
FROM `bigquery-public-data.fhir_synthea.patient`) AS P
JOIN
(SELECT subject.patientId as patientId,
COUNT(DISTINCT medication.codeableConcept.coding[safe_offset(0)].code) AS med_count
FROM `bigquery-public-data.fhir_synthea.medication_request`
WHERE status = 'active'
GROUP BY 1
) AS MR
ON MR.patientId = P.id
JOIN
(SELECT
PatientId,
STRING_AGG(DISTINCT condition_desc, ", ") AS Conditions,
STRING_AGG(DISTINCT condition_code, ", ") AS Codes
FROM(
SELECT
subject.patientId as PatientId,
code.coding[safe_offset(0)].code condition_code,
code.coding[safe_offset(0)].display condition_desc
FROM `bigquery-public-data.fhir_synthea.condition`
wHERE
code.coding[safe_offset(0)].display = 'Diabetes'
OR
code.coding[safe_offset(0)].display = 'Hypertension'
)
GROUP BY PatientId
) AS Condition
ON MR.patientId = Condition.PatientId
WHERE med_count >= 7
AND P.deceased_datetime is NULL /*only alive patients*/
GROUP BY patientId, last_name, first_name, Condition.Codes, Condition.Conditions, MR.med_count
ORDER BY last_name
)
SELECT COUNT(*) FROM INFO
SELECT
o.artist_display_name,
o.title,
o.object_end_date,
o.medium,
i.original_image_url
FROM (
SELECT
object_id,
title,
artist_display_name,
object_end_date,
medium
FROM
`bigquery-public-data.the_met.objects`
WHERE
department = "Photographs"
AND object_name LIKE "%Photograph%"
AND artist_display_name != "Unknown"
AND object_end_date <= 1839
) o
INNER JOIN (
SELECT
original_image_url,
object_id
FROM
`bigquery-public-data.the_met.images`
) i
ON
o.object_id = i.object_id
ORDER BY
o.object_end_date
;
WITH StateCases AS (
SELECT
b.state_name,
b.date,
b.confirmed_cases - a.confirmed_cases AS daily_new_cases
FROM
(SELECT
state_name,
state_fips_code,
confirmed_cases,
DATE_ADD(date, INTERVAL 1 DAY) AS date_shift
FROM
`bigquery-public-data.covid19_nyt.us_states`
WHERE
date >= '2020-02-29' AND date <= '2020-05-30'
) a
JOIN
`bigquery-public-data.covid19_nyt.us_states` b
ON a.state_fips_code = b.state_fips_code AND a.date_shift = b.date
WHERE
b.date >= '2020-03-01' AND b.date <= '2020-05-31'
),
RankedStatesPerDay AS (
SELECT
state_name,
date,
daily_new_cases,
RANK() OVER (PARTITION BY date ORDER BY daily_new_cases DESC) as rank
FROM
StateCases
),
TopStates AS (
SELECT
state_name,
COUNT(*) AS appearance_count
FROM
RankedStatesPerDay
WHERE
rank <= 5
GROUP BY
state_name
ORDER BY
appearance_count DESC
),
FourthState AS (
SELECT
state_name
FROM
TopStates
LIMIT 1
OFFSET 3
),
CountyCases AS (
SELECT
b.county,
b.date,
b.confirmed_cases - a.confirmed_cases AS daily_new_cases
FROM
(SELECT
county,
county_fips_code,
confirmed_cases,
DATE_ADD(date, INTERVAL 1 DAY) AS date_shift
FROM
`bigquery-public-data.covid19_nyt.us_counties`
WHERE
date >= '2020-02-29' AND date <= '2020-05-30'
) a
JOIN
`bigquery-public-data.covid19_nyt.us_counties` b
ON a.county_fips_code = b.county_fips_code AND a.date_shift = b.date
WHERE
b.date >= '2020-03-01' AND b.date <= '2020-05-31'
AND b.state_name = (SELECT state_name FROM FourthState)
),
RankedCountiesPerDay AS (
SELECT
county,
date,
daily_new_cases,
RANK() OVER (PARTITION BY date ORDER BY daily_new_cases DESC) as rank
FROM
CountyCases
),
TopCounties AS (
SELECT
county,
COUNT(*) AS appearance_count
FROM
RankedCountiesPerDay
WHERE
rank <= 5
GROUP BY
county
ORDER BY
appearance_count DESC
LIMIT 5
)
SELECT
county
FROM
TopCounties;
WITH
quant AS (
SELECT
meta.sample_submitter_id,
meta.sample_type,
quant.case_id,
quant.aliquot_id,
quant.gene_symbol,
CAST(quant.protein_abundance_log2ratio AS FLOAT64) AS protein_abundance_log2ratio
FROM
`isb-cgc-bq.CPTAC.quant_proteome_CPTAC_CCRCC_discovery_study_pdc_current` AS quant
JOIN
`isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` AS meta
ON quant.case_id = meta.case_id
AND quant.aliquot_id = meta.aliquot_id
AND meta.sample_type IN ('Primary Tumor', 'Solid Tissue Normal')
),
gexp AS (
SELECT DISTINCT
meta.sample_submitter_id,
meta.sample_type,
rnaseq.gene_name,
LOG(rnaseq.fpkm_unstranded + 1) AS HTSeq__FPKM -- Confirm the correct column name here
FROM
`isb-cgc-bq.CPTAC.RNAseq_hg38_gdc_current` AS rnaseq
JOIN
`isb-cgc-bq.PDC_metadata.aliquot_to_case_mapping_current` AS meta
ON meta.sample_submitter_id = rnaseq.sample_barcode
),
correlation AS (
SELECT
quant.gene_symbol,
gexp.sample_type,
COUNT(*) AS n,
CORR(protein_abundance_log2ratio, HTSeq__FPKM) AS corr -- Confirm the correct column name here
FROM
quant
JOIN
gexp
ON quant.sample_submitter_id = gexp.sample_submitter_id
AND gexp.gene_name = quant.gene_symbol
AND gexp.sample_type = quant.sample_type
GROUP BY
quant.gene_symbol, gexp.sample_type
),
pval AS (
SELECT
gene_symbol,
sample_type,
n,
corr
FROM
correlation
WHERE
ABS(corr) <= 0.5
)
SELECT sample_type, AVG(corr)
FROM pval
GROUP BY sample_type;
WITH outcomes AS (
SELECT
season, # 1994
"win" AS label, # our label
win_seed AS seed, # ranking # this time without seed even
win_school_ncaa AS school_ncaa,
lose_seed AS opponent_seed, # ranking
lose_school_ncaa AS opponent_school_ncaa
FROM `data-to-insights.ncaa.mbb_historical_tournament_games` t
WHERE season >= 2014
UNION ALL
SELECT
season, # 1994
"loss" AS label, # our label
lose_seed AS seed, # ranking
lose_school_ncaa AS school_ncaa,
win_seed AS opponent_seed, # ranking
win_school_ncaa AS opponent_school_ncaa
FROM
`data-to-insights.ncaa.mbb_historical_tournament_games` t
WHERE season >= 2014
UNION ALL
SELECT
season,
label,
seed,
school_ncaa,
opponent_seed,
opponent_school_ncaa
FROM
`data-to-insights.ncaa.2018_tournament_results`
)
SELECT
o.season,
label,
seed,
school_ncaa,
team.pace_rank,
team.poss_40min,
team.pace_rating,
team.efficiency_rank,
team.pts_100poss,
team.efficiency_rating,
opponent_seed,
opponent_school_ncaa,
opp.pace_rank AS opp_pace_rank,
opp.poss_40min AS opp_poss_40min,
opp.pace_rating AS opp_pace_rating,
opp.efficiency_rank AS opp_efficiency_rank,
opp.pts_100poss AS opp_pts_100poss,
opp.efficiency_rating AS opp_efficiency_rating,
opp.pace_rank - team.pace_rank AS pace_rank_diff,
opp.poss_40min - team.poss_40min AS pace_stat_diff,
opp.pace_rating - team.pace_rating AS pace_rating_diff,
opp.efficiency_rank - team.efficiency_rank AS eff_rank_diff,
opp.pts_100poss - team.pts_100poss AS eff_stat_diff,
opp.efficiency_rating - team.efficiency_rating AS eff_rating_diff
FROM outcomes AS o
LEFT JOIN `data-to-insights.ncaa.feature_engineering` AS team
ON o.school_ncaa = team.team AND o.season = team.season
LEFT JOIN `data-to-insights.ncaa.feature_engineering` AS opp
ON o.opponent_school_ncaa = opp.team AND o.season = opp.season
WITH
barcodes AS (
SELECT bcr_patient_barcode AS ParticipantBarcode
FROM isb-cgc-bq.pancancer_atlas.Filtered_clinical_PANCAN_patient_with_followup
WHERE acronym = 'PAAD'
),
table1 AS (
SELECT
t1.ParticipantBarcode,
IF(t2.ParticipantBarcode IS NULL, 'NO', 'YES') AS data
FROM
barcodes AS t1
LEFT JOIN
(
SELECT
ParticipantBarcode AS ParticipantBarcode
FROM isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample
WHERE Study = 'PAAD' AND Hugo_Symbol = 'KRAS'
AND FILTER = 'PASS'
GROUP BY ParticipantBarcode
) AS t2
ON t1.ParticipantBarcode = t2.ParticipantBarcode
),
table2 AS (
SELECT
t1.ParticipantBarcode,
IF(t2.ParticipantBarcode IS NULL, 'NO', 'YES') AS data
FROM
barcodes AS t1
LEFT JOIN
(
SELECT
ParticipantBarcode AS ParticipantBarcode
FROM isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample
WHERE Study = 'PAAD' AND Hugo_Symbol = 'TP53'
AND FILTER = 'PASS'
GROUP BY ParticipantBarcode
) AS t2
ON t1.ParticipantBarcode = t2.ParticipantBarcode
),
summ_table AS (
SELECT
n1.data AS data1,
n2.data AS data2,
COUNT(*) AS Nij
FROM
table1 AS n1
INNER JOIN
table2 AS n2
ON
n1.ParticipantBarcode = n2.ParticipantBarcode
GROUP BY
data1, data2
),
contingency_table AS (
SELECT
MAX(IF((data1 = 'YES') AND (data2 = 'YES'), Nij, 0)) AS a,
MAX(IF((data1 = 'YES') AND (data2 = 'NO'), Nij, 0)) AS b,
MAX(IF((data1 = 'NO') AND (data2 = 'YES'), Nij, 0)) AS c,
MAX(IF((data1 = 'NO') AND (data2 = 'NO'), Nij, 0)) AS d,
(MAX(IF((data1 = 'YES') AND (data2 = 'YES'), Nij, 0)) + MAX(IF((data1 = 'YES') AND (data2 = 'NO'), Nij, 0))) AS row1_total,
(MAX(IF((data1 = 'NO') AND (data2 = 'YES'), Nij, 0)) + MAX(IF((data1 = 'NO') AND (data2 = 'NO'), Nij, 0))) AS row2_total,
(MAX(IF((data1 = 'YES') AND (data2 = 'YES'), Nij, 0)) + MAX(IF((data1 = 'NO') AND (data2 = 'YES'), Nij, 0))) AS col1_total,
(MAX(IF((data1 = 'YES') AND (data2 = 'NO'), Nij, 0)) + MAX(IF((data1 = 'NO') AND (data2 = 'NO'), Nij, 0))) AS col2_total,
SUM(Nij) AS grand_total
FROM summ_table
)
SELECT
POWER((a - (row1_total * col1_total) / grand_total), 2) / ((row1_total * col1_total) / grand_total) +
POWER((b - (row1_total * col2_total) / grand_total), 2) / ((row1_total * col2_total) / grand_total) +
POWER((c - (row2_total * col1_total) / grand_total), 2) / ((row2_total * col1_total) / grand_total) +
POWER((d - (row2_total * col2_total) / grand_total), 2) / ((row2_total * col2_total) / grand_total) AS chi_square_statistic
FROM contingency_table
WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL AND d IS NOT NULL;
WITH
barcodes AS (
SELECT bcr_patient_barcode AS ParticipantBarcode
FROM `isb-cgc-bq.pancancer_atlas.Filtered_clinical_PANCAN_patient_with_followup`
WHERE acronym = 'PAAD'
)
,table1 AS (
SELECT
t1.ParticipantBarcode,
IF( t2.ParticipantBarcode is null, 'NO', 'YES') as data
FROM
barcodes AS t1
LEFT JOIN
(
SELECT
ParticipantBarcode AS ParticipantBarcode
FROM `isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample`
WHERE Study = 'PAAD' AND Hugo_Symbol = 'KRAS'
AND FILTER = 'PASS'
GROUP BY ParticipantBarcode
) AS t2
ON t1.ParticipantBarcode = t2.ParticipantBarcode
)
,table2 AS (
SELECT
t1.ParticipantBarcode,
IF( t2.ParticipantBarcode is null, 'NO', 'YES') as data
FROM
barcodes AS t1
LEFT JOIN
(
SELECT
ParticipantBarcode AS ParticipantBarcode
FROM `isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample`
WHERE Study = 'PAAD' AND Hugo_Symbol = 'TP53'
AND FILTER = 'PASS'
GROUP BY ParticipantBarcode
) AS t2
ON t1.ParticipantBarcode = t2.ParticipantBarcode
),
INFO AS (
SELECT
n1.data as data1,
n2.data as data2,
COUNT(*) as Nij
FROM
table1 AS n1
INNER JOIN
table2 AS n2
ON
n1.ParticipantBarcode = n2.ParticipantBarcode
GROUP BY
data1, data2
)
SELECT
(SELECT Nij FROM INFO WHERE data1="YES" AND data2="YES")
-
(SELECT Nij FROM INFO WHERE data1="NO" AND data2="NO")
WITH ny_top_drug AS (
SELECT
drug_name AS drug_name,
ROUND(SUM(total_claim_count)) AS total_claim_count
FROM
`bigquery-public-data.cms_medicare.part_d_prescriber_2014`
WHERE
nppes_provider_state = 'NY'
GROUP BY
drug_name
ORDER BY
total_claim_count DESC
LIMIT 1
),
top_5_states AS (
SELECT
nppes_provider_state AS state,
SUM(total_claim_count) AS total_claim_count,
SUM(total_drug_cost) AS total_drug_cost
FROM
`bigquery-public-data.cms_medicare.part_d_prescriber_2014`
WHERE
drug_name = (SELECT drug_name FROM ny_top_drug)
GROUP BY
state
ORDER BY
total_claim_count DESC
LIMIT 5
)
SELECT
state,
total_claim_count,
total_drug_cost
FROM
top_5_states;
SELECT
AVG(TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, SECOND) / 60.0) AS average_trip_duration_in_minutes
FROM
(
SELECT *
FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2016` t
WHERE
pickup_datetime BETWEEN '2016-02-01' AND '2016-02-07' AND
dropoff_datetime BETWEEN '2016-02-01' AND '2016-02-07' AND
TIMESTAMP_DIFF(dropoff_datetime, pickup_datetime, SECOND) > 0 AND
passenger_count > 3 AND
trip_distance >= 10
) t
INNER JOIN `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` tz
ON t.pickup_location_id = tz.zone_id
INNER JOIN `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom` tz1
ON t.dropoff_location_id = tz1.zone_id
WHERE
tz.borough = "Brooklyn" AND
tz1.borough = "Brooklyn";
SELECT
team_name,
COUNT(*) AS top_performer_count
FROM (
SELECT
DISTINCT c2.season,
c2.market AS team_name
FROM (
SELECT
season AS a,
MAX(wins) AS win_max
FROM
`bigquery-public-data.ncaa_basketball.mbb_historical_teams_seasons`
WHERE
season<=2000
AND season >=1900
GROUP BY
season ),
`bigquery-public-data.ncaa_basketball.mbb_historical_teams_seasons` c2
WHERE
win_max = c2.wins
AND a = c2.season
AND c2.market IS NOT NULL
ORDER BY
c2.season)
GROUP BY
team_name
ORDER BY
top_performer_count DESC,
team_name
LIMIT
5
WITH price_2020 AS (
SELECT
category_name AS category,
AVG(state_bottle_retail / (bottle_volume_ml / 1000)) AS avg_price_liter_2020
FROM
`bigquery-public-data.iowa_liquor_sales.sales`
WHERE
bottle_volume_ml > 0
AND EXTRACT(YEAR FROM date) = 2020
GROUP BY
category
),
price_2019 AS (
SELECT
category_name AS category,
AVG(state_bottle_retail / (bottle_volume_ml / 1000)) AS avg_price_liter_2019
FROM
`bigquery-public-data.iowa_liquor_sales.sales`
WHERE
bottle_volume_ml > 0
AND EXTRACT(YEAR FROM date) = 2019
GROUP BY
category
),
price_2021 AS (
SELECT
category_name AS category,
AVG(state_bottle_retail / (bottle_volume_ml / 1000)) AS avg_price_liter_2021
FROM
`bigquery-public-data.iowa_liquor_sales.sales`
WHERE
bottle_volume_ml > 0
AND EXTRACT(YEAR FROM date) = 2021
GROUP BY
category
)
SELECT
price_2021.category,
price_2019.avg_price_liter_2019,
price_2020.avg_price_liter_2020,
price_2021.avg_price_liter_2021
FROM
price_2021
LEFT JOIN
price_2019 ON price_2021.category = price_2019.category
LEFT JOIN
price_2020 ON price_2021.category = price_2020.category
ORDER BY
price_2021.avg_price_liter_2021 DESC
LIMIT
10;
WITH stations_n_entrances AS (
SELECT borough_name,s.station_name,entry,ada_compliant
FROM `bigquery-public-data.new_york_subway.stations` s
JOIN `bigquery-public-data.new_york_subway.station_entrances` se
ON s.station_name = se.station_name
)
SELECT se.borough_name, COUNT(DISTINCT se.station_name) num_stations,
COUNT(DISTINCT adas.station_name) num_stations_w_compliant_entrance,
(100*COUNT(DISTINCT adas.station_name))/(COUNT(DISTINCT se.station_name)) percent_compliant_stations
FROM `stations_n_entrances` se
LEFT JOIN `stations_n_entrances` adas
ON se.station_name = adas.station_name
AND adas.entry AND adas.ada_compliant
GROUP BY 1
ORDER BY 4 DESC
SELECT user
FROM (
Select user
From `bigquery-public-data.eclipse_megamovie.photos_v_0_1`
UNION ALL
Select user
From`bigquery-public-data.eclipse_megamovie.photos_v_0_2`
UNION ALL
Select user
From`bigquery-public-data.eclipse_megamovie.photos_v_0_3`
)
GROUP BY user
HAVING COUNT (user)=(
SELECT MAX(mycount)
FROM (
SELECT user, COUNT(user) mycount
FROM (
Select user
From `bigquery-public-data.eclipse_megamovie.photos_v_0_1`
UNION ALL
Select user
From`bigquery-public-data.eclipse_megamovie.photos_v_0_2`
UNION ALL
Select user
From`bigquery-public-data.eclipse_megamovie.photos_v_0_3`
)
GROUP BY user))
ORDER BY COUNT(user)
LIMIT 1
WITH AnnualSales AS (
SELECT
item_description,
EXTRACT(YEAR FROM date) AS year,
SUM(sale_dollars) AS total_sales_revenue,
COUNT(DISTINCT invoice_and_item_number) AS unique_purchases
FROM
`bigquery-public-data.iowa_liquor_sales.sales`
WHERE
EXTRACT(YEAR FROM date) IN (2022, 2023)
AND item_description IS NOT NULL
AND sale_dollars IS NOT NULL
GROUP BY
item_description, year
),
YoYGrowth AS (
SELECT
curr.item_description,
curr.year,
curr.total_sales_revenue,
curr.unique_purchases,
LAG(curr.total_sales_revenue) OVER(PARTITION BY curr.item_description ORDER BY curr.year) AS prev_year_sales_revenue,
(curr.total_sales_revenue - LAG(curr.total_sales_revenue) OVER(PARTITION BY curr.item_description ORDER BY curr.year)) / LAG(curr.total_sales_revenue) OVER(PARTITION BY curr.item_description ORDER BY curr.year) * 100 AS yoy_growth_percentage
FROM
AnnualSales curr
),
total_info AS (
SELECT
item_description,
year,
total_sales_revenue,
unique_purchases,
prev_year_sales_revenue,
yoy_growth_percentage
FROM
YoYGrowth
WHERE
year = 2023
AND prev_year_sales_revenue IS NOT NULL -- Exclude rows where there's no previous year data to calculate YoY growth
ORDER BY
year, total_sales_revenue
DESC
)
SELECT item_description
FROM total_info
order by yoy_growth_percentage
DESC
LIMIT 5
WITH top5_categories AS (
SELECT minor_category
FROM `bigquery-public-data.london_crime.crime_by_lsoa`
WHERE year = 2008
GROUP BY minor_category
ORDER BY SUM(value) DESC
LIMIT 5
),
total_crimes_per_year AS (
SELECT
year,
SUM(value) AS total_crimes_year
FROM `bigquery-public-data.london_crime.crime_by_lsoa`
GROUP BY year
),
top5_crimes_per_year AS (
SELECT
year,
minor_category,
SUM(value) AS total_crimes_category_year
FROM `bigquery-public-data.london_crime.crime_by_lsoa`
WHERE minor_category IN (SELECT minor_category FROM top5_categories)
GROUP BY year, minor_category
)
SELECT
t.year,
ROUND(SUM(CASE WHEN t.minor_category = (SELECT minor_category FROM top5_categories LIMIT 1 OFFSET 0) THEN t.total_crimes_category_year ELSE 0 END) / y.total_crimes_year * 100, 2) AS `Category 1`,
ROUND(SUM(CASE WHEN t.minor_category = (SELECT minor_category FROM top5_categories LIMIT 1 OFFSET 1) THEN t.total_crimes_category_year ELSE 0 END) / y.total_crimes_year * 100, 2) AS `Category 2`,
ROUND(SUM(CASE WHEN t.minor_category = (SELECT minor_category FROM top5_categories LIMIT 1 OFFSET 2) THEN t.total_crimes_category_year ELSE 0 END) / y.total_crimes_year * 100, 2) AS `Category 3`,
ROUND(SUM(CASE WHEN t.minor_category = (SELECT minor_category FROM top5_categories LIMIT 1 OFFSET 3) THEN t.total_crimes_category_year ELSE 0 END) / y.total_crimes_year * 100, 2) AS `Category 4`,
ROUND(SUM(CASE WHEN t.minor_category = (SELECT minor_category FROM top5_categories LIMIT 1 OFFSET 4) THEN t.total_crimes_category_year ELSE 0 END) / y.total_crimes_year * 100, 2) AS `Category 5`
FROM
top5_crimes_per_year t
JOIN
total_crimes_per_year y ON t.year = y.year
GROUP BY
t.year, y.total_crimes_year
ORDER BY
t.year;
WITH ranked_crimes AS (
SELECT
borough,
major_category,
RANK() OVER(PARTITION BY borough ORDER BY SUM(value) DESC) AS rank_per_borough,
SUM(value) AS no_of_incidents
FROM
`bigquery-public-data.london_crime.crime_by_lsoa`
GROUP BY
borough,
major_category
)
SELECT
borough,
major_category,
rank_per_borough,
no_of_incidents
FROM
ranked_crimes
WHERE
rank_per_borough <= 3
AND
borough = 'Barking and Dagenham'
ORDER BY
borough,
rank_per_borough;
WITH borough_data AS (
SELECT
year,
month,
borough,
major_category,
minor_category,
SUM(value) AS total,
CASE
WHEN
major_category = 'Theft and Handling'
THEN
'Theft and Handling'
ELSE
'Other'
END AS major_division,
CASE
WHEN
minor_category = 'Other Theft' THEN minor_category
ELSE
'Other'
END AS minor_division,
FROM
bigquery-public-data.london_crime.crime_by_lsoa
GROUP BY 1,2,3,4,5
ORDER BY 1,2
)
SELECT year, SUM(total) AS year_total
FROM borough_data
WHERE
borough = 'Westminster'
AND
major_division != 'Other'
AND
minor_division != 'Other'
GROUP BY year, major_division, minor_division
ORDER BY year;
SELECT
A.state,
drug_name,
total_claim_count
FROM (
SELECT
generic_name AS drug_name,
nppes_provider_state AS state,
ROUND(SUM(total_claim_count)) AS total_claim_count,
ROUND(SUM(total_day_supply)) AS day_supply,
ROUND(SUM(total_drug_cost)) / 1e6 AS total_cost_millions
FROM
`bigquery-public-data.cms_medicare.part_d_prescriber_2014`
GROUP BY
state,
drug_name) A
INNER JOIN (
SELECT
state,
MAX(total_claim_count) AS max_total_claim_count
FROM (
SELECT
nppes_provider_state AS state,
ROUND(SUM(total_claim_count)) AS total_claim_count
FROM
`bigquery-public-data.cms_medicare.part_d_prescriber_2014`
GROUP BY
state,
generic_name)
GROUP BY
state) B
ON
A.state = B.state
AND A.total_claim_count = B.max_total_claim_count;
SELECT
Provider_Name
FROM
(
SELECT
OP.provider_state AS State,
OP.provider_city AS City,
OP.provider_id AS Provider_ID,
OP.provider_name AS Provider_Name,
ROUND(OP.average_OP_cost) AS Average_OP_Cost,
ROUND(IP.average_IP_cost) AS Average_IP_Cost,
ROUND(OP.average_OP_cost + IP.average_IP_cost) AS Combined_Average_Cost
FROM (
SELECT
provider_state,
provider_city,
provider_id,
provider_name,
SUM(average_total_payments*outpatient_services)/SUM(outpatient_services) AS average_OP_cost
FROM
`bigquery-public-data.cms_medicare.outpatient_charges_2014`
GROUP BY
provider_state,
provider_city,
provider_id,
provider_name ) AS OP
INNER JOIN (
SELECT
provider_state,
provider_city,
provider_id,
provider_name,
SUM(average_medicare_payments*total_discharges)/SUM(total_discharges) AS average_IP_cost
FROM
`bigquery-public-data.cms_medicare.inpatient_charges_2014`
GROUP BY
provider_state,
provider_city,
provider_id,
provider_name ) AS IP
ON
OP.provider_id = IP.provider_id
AND OP.provider_state = IP.provider_state
AND OP.provider_city = IP.provider_city
AND OP.provider_name = IP.provider_name
ORDER BY
combined_average_cost DESC
LIMIT
1
);
WITH
visit AS (
SELECT fullvisitorid, MIN(date) AS date_first_visit, MAX(date) AS date_last_visit
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` GROUP BY fullvisitorid),
device_visit AS (
SELECT DISTINCT fullvisitorid, date, device.deviceCategory
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`),
transactions AS (
SELECT fullvisitorid, MIN(date) AS date_transactions, 1 AS transaction
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga, UNNEST(ga.hits) AS hits
WHERE hits.transaction.transactionId IS NOT NULL GROUP BY fullvisitorid),
device_transactions AS (
SELECT DISTINCT fullvisitorid, date, device.deviceCategory
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga, UNNEST(ga.hits) AS hits
WHERE hits.transaction.transactionId IS NOT NULL),
visits_transactions AS (
SELECT visit.fullvisitorid, date_first_visit, date_transactions, date_last_visit ,
device_visit.deviceCategory AS device_last_visit, device_transactions.deviceCategory AS device_transaction,
IFNULL(transactions.transaction,0) AS transaction
FROM visit LEFT JOIN transactions ON visit.fullvisitorid = transactions.fullvisitorid
LEFT JOIN device_visit ON visit.fullvisitorid = device_visit.fullvisitorid
AND visit.date_last_visit = device_visit.date
LEFT JOIN device_transactions ON visit.fullvisitorid = device_transactions.fullvisitorid
AND transactions.date_transactions = device_transactions.date ),
mortality_table AS (
SELECT fullvisitorid, date_first_visit,
CASE WHEN date_transactions IS NULL THEN date_last_visit ELSE date_transactions END AS date_event,
CASE WHEN device_transaction IS NULL THEN device_last_visit ELSE device_transaction END AS device, transaction
FROM visits_transactions )
SELECT DATE_DIFF(PARSE_DATE('%Y%m%d',date_event), PARSE_DATE('%Y%m%d', date_first_visit),DAY) AS time
FROM mortality_table
WHERE device = 'mobile'
ORDER BY DATE_DIFF(PARSE_DATE('%Y%m%d',date_event), PARSE_DATE('%Y%m%d', date_first_visit),DAY) DESC
LIMIT 1
WITH visitor_pageviews AS (
SELECT
FORMAT_DATE('%Y%m', PARSE_DATE('%Y%m%d', date)) AS month,
CASE WHEN totals.transactions > 0 THEN 'purchase' ELSE 'non_purchase' END AS purchase_status,
fullVisitorId,
SUM(totals.pageviews) AS total_pageviews
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20170601' AND '20170731'
AND totals.pageviews IS NOT NULL
GROUP BY
month, purchase_status, fullVisitorId
),
avg_pageviews AS (
SELECT
month,
purchase_status,
AVG(total_pageviews) AS avg_pageviews_per_visitor
FROM
visitor_pageviews
GROUP BY
month, purchase_status
)
SELECT
month,
MAX(CASE WHEN purchase_status = 'purchase' THEN avg_pageviews_per_visitor END) AS avg_pageviews_purchase,
MAX(CASE WHEN purchase_status = 'non_purchase' THEN avg_pageviews_per_visitor END) AS avg_pageviews_non_purchase
FROM
avg_pageviews
GROUP BY
month
ORDER BY
month
WITH
cte1 AS
(SELECT
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))),'0',
EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
COUNT(hits.eCommerceAction.action_type) AS num_product_view
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST(hits) AS hits
WHERE _table_suffix BETWEEN '0101' AND '0331'
AND hits.eCommerceAction.action_type = '2'
GROUP BY month),
cte2 AS
(SELECT
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))),'0',
EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
COUNT(hits.eCommerceAction.action_type) AS num_addtocart
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST(hits) AS hits
WHERE _table_suffix BETWEEN '0101' AND '0331'
AND hits.eCommerceAction.action_type = '3'
GROUP BY month),
cte3 AS
(SELECT
CONCAT(EXTRACT(YEAR FROM (PARSE_DATE('%Y%m%d', date))),'0',
EXTRACT(MONTH FROM (PARSE_DATE('%Y%m%d', date)))) AS month,
COUNT(hits.eCommerceAction.action_type) AS num_purchase
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_2017*`,
UNNEST(hits) AS hits,
UNNEST(hits.product) AS product
WHERE _table_suffix BETWEEN '0101' AND '0331'
AND hits.eCommerceAction.action_type = '6'
AND product.productRevenue IS NOT NULL
GROUP BY month)
SELECT
ROUND((num_addtocart/num_product_view * 100),2) AS add_to_cart_rate,
ROUND((num_purchase/num_product_view * 100),2) AS purchase_rate
FROM cte1
LEFT JOIN cte2
USING(month)
LEFT JOIN cte3
USING(month)
ORDER BY month;
WITH
visit AS (
SELECT fullvisitorid, MIN(date) AS date_first_visit
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`
GROUP BY fullvisitorid
),
transactions AS (
SELECT fullvisitorid, MIN(date) AS date_transactions, 1 AS transaction
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga,
UNNEST(ga.hits) AS hits
WHERE hits.transaction.transactionId IS NOT NULL
GROUP BY fullvisitorid
),
device_transactions AS (
SELECT DISTINCT fullvisitorid, date, device.deviceCategory AS device_transaction
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*` AS ga,
UNNEST(ga.hits) AS hits
WHERE hits.transaction.transactionId IS NOT NULL
),
visits_transactions AS (
SELECT visit.fullvisitorid, date_first_visit, date_transactions, device_transaction
FROM visit
LEFT JOIN transactions ON visit.fullvisitorid = transactions.fullvisitorid
LEFT JOIN device_transactions ON visit.fullvisitorid = device_transactions.fullvisitorid
AND transactions.date_transactions = device_transactions.date
)
SELECT fullvisitorid
FROM visits_transactions
WHERE DATE_DIFF(PARSE_DATE('%Y%m%d', date_transactions), PARSE_DATE('%Y%m%d', date_first_visit), DAY) > 0
AND device_transaction = "mobile";
SELECT
t.year,
CASE
WHEN t.year = 2013 THEN (
SELECT
COUNT(DISTINCT station_id)
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips` t
INNER JOIN
`bigquery-public-data.austin_bikeshare.bikeshare_stations` s
ON
t.start_station_id = s.station_id
WHERE
s.status = 'active' AND EXTRACT(YEAR FROM start_time) = 2013
)
WHEN t.year = 2014 THEN (
SELECT
COUNT(DISTINCT station_id)
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips` t
INNER JOIN
`bigquery-public-data.austin_bikeshare.bikeshare_stations` s
ON
t.start_station_id = s.station_id
WHERE
s.status = 'active' AND EXTRACT(YEAR FROM start_time) = 2014
)
END
AS number_status_active,
CASE
WHEN t.year = 2013 THEN (
SELECT
COUNT(DISTINCT station_id)
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips` t
INNER JOIN
`bigquery-public-data.austin_bikeshare.bikeshare_stations` s
ON
t.start_station_id = s.station_id
WHERE
s.status = 'closed' AND EXTRACT(YEAR FROM start_time) = 2013
)
WHEN t.year = 2014 THEN (
SELECT
COUNT(DISTINCT station_id)
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips` t
INNER JOIN
`bigquery-public-data.austin_bikeshare.bikeshare_stations` s
ON
t.start_station_id = s.station_id
WHERE
s.status = 'closed' AND EXTRACT(YEAR FROM start_time) = 2014
)
END
AS number_status_closed
FROM
(
SELECT
EXTRACT(YEAR FROM start_time) AS year,
start_station_id
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips`
)
AS t
INNER JOIN
`bigquery-public-data.austin_bikeshare.bikeshare_stations` s
ON
t.start_station_id = s.station_id
WHERE
t.year BETWEEN 2013 AND 2014
GROUP BY
t.year
ORDER BY
t.year
WITH UserAnswers AS (
SELECT
owner_user_id AS answer_owner_id,
COUNT(id) AS answer_count
FROM bigquery-public-data.stackoverflow.posts_answers
WHERE owner_user_id IS NOT NULL
GROUP BY owner_user_id
),
DetailedUsers AS (
SELECT
id AS user_id,
display_name AS user_display_name,
reputation
FROM bigquery-public-data.stackoverflow.users
WHERE display_name IS NOT NULL AND reputation > 10
),
RankedUsers AS (
SELECT
u.user_display_name,
u.reputation,
a.answer_count,
ROW_NUMBER() OVER (ORDER BY a.answer_count DESC) AS rank
FROM DetailedUsers u
JOIN UserAnswers a ON u.user_id = a.answer_owner_id
)
SELECT
user_display_name,
FROM RankedUsers
WHERE rank = 1;
SELECT
COUNT(1) AS num_rides
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips`
WHERE
start_station_name
NOT IN ('Mobile Station', 'Repair Shop')
AND
end_station_name
NOT IN ('Mobile Station', 'Repair Shop')
AND
subscriber_type = 'Student Membership'
AND
bike_type = 'electric'
AND
duration_minutes > 10
GROUP BY
EXTRACT(YEAR from start_time),
EXTRACT(MONTH from start_time),
EXTRACT(DAY from start_time)
ORDER BY num_rides DESC
LIMIT 1
SELECT
district
FROM (
SELECT
S.starting_district AS district,
T.start_station_id,
T.end_station_id
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_trips` AS T
INNER JOIN (
SELECT
station_id,
council_district AS starting_district
FROM
`bigquery-public-data.austin_bikeshare.bikeshare_stations`
WHERE
status = "active"
) AS S ON T.start_station_id = S.station_id
WHERE
S.starting_district IN (
SELECT council_district
FROM `bigquery-public-data.austin_bikeshare.bikeshare_stations`
WHERE
status = "active" AND
station_id = SAFE_CAST(T.end_station_id AS INT64)
)
AND T.start_station_id != SAFE_CAST(T.end_station_id AS INT64)
)
GROUP BY district
ORDER BY COUNT(*) DESC
LIMIT 1;
SELECT
category,
COUNT(*) AS number_total_by_category,
CASE
WHEN category = 'tech' THEN
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE (LOWER(body) LIKE '%education%') AND category = 'tech') * 100 /
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE category = 'tech')
WHEN category = 'sport' THEN
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE (LOWER(body) LIKE '%education%') AND category = 'sport') * 100 /
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE category = 'sport')
WHEN category = 'business' THEN
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE (LOWER(body) LIKE '%education%') AND category = 'business') * 100 /
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE category = 'business')
WHEN category = 'politics' THEN
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE (LOWER(body) LIKE '%education%') AND category = 'politics') * 100 /
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE category = 'politics')
WHEN category = 'entertainment' THEN
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE (LOWER(body) LIKE '%education%') AND category = 'entertainment') * 100 /
(SELECT count(*)
FROM `bigquery-public-data.bbc_news.fulltext`
WHERE category = 'entertainment')
END AS percent_education
FROM `bigquery-public-data.bbc_news.fulltext`
GROUP BY
category;
with _fips AS
(
SELECT
state_fips_code
FROM
`bigquery-public-data.census_utility.fips_codes_states`
WHERE
state_name = "Florida"
)
,_zip AS
(
SELECT
z.zip_code,
z.zip_code_geom,
FROM
`bigquery-public-data.geo_us_boundaries.zip_codes` z, _fips u
WHERE
z.state_fips_code = u.state_fips_code
)
,locations AS
(
SELECT
COUNT(i.institution_name) AS count_locations,
l.zip_code
FROM
`bigquery-public-data.fdic_banks.institutions` i
JOIN
`bigquery-public-data.fdic_banks.locations` l
USING (fdic_certificate_number)
WHERE
l.state IS NOT NULL
AND
l.state_name IS NOT NULL
GROUP BY 2
)
SELECT
z.zip_code
FROM
_zip z
JOIN
locations l
USING (zip_code)
GROUP BY
z.zip_code
ORDER BY
SUM(l.count_locations) DESC
LIMIT 1;
SELECT
a.name AS name
FROM
`bigquery-public-data.usa_names.usa_1910_current` a
JOIN (
SELECT
name,
gender,
year,
SUM(number) AS total_number
FROM
`bigquery-public-data.usa_names.usa_1910_current`
GROUP BY
name,
gender,
year) b
ON
a.name = b.name
AND a.gender = b.gender
AND a.year = b.year
WHERE
a.gender = 'F' AND
a.state = 'WY' AND
a.year = 2021
ORDER BY (a.number / b.total_number) DESC
LIMIT 1
with
stations_selected as (
select
usaf,
wban,
country,
name
from
`bigquery-public-data.noaa_gsod.stations`
where
country in ('US', 'UK')
),
data_filtered as (
select
gsod.*,
stations.country
from
`bigquery-public-data.noaa_gsod.gsod2023` gsod
join
stations_selected stations
on
gsod.stn = stations.usaf
and gsod.wban = stations.wban
where
date(gsod.date) between '2023-10-01' and '2023-10-31'
and gsod.temp != 9999.9
),
-- US Metrics
us_metrics as (
select
date(date) as metric_date,
avg(temp) as avg_temp_us,
min(temp) as min_temp_us,
max(temp) as max_temp_us
from
data_filtered
where
country = 'US'
group by
metric_date
),
-- UK Metrics
uk_metrics as (
select
date(date) as metric_date,
avg(temp) as avg_temp_uk,
min(temp) as min_temp_uk,
max(temp) as max_temp_uk
from
data_filtered
where
country = 'UK'
group by
metric_date
),
-- Temperature Differences
temp_differences as (
select
us.metric_date,
us.max_temp_us - uk.max_temp_uk as max_temp_diff,
us.min_temp_us - uk.min_temp_uk as min_temp_diff,
us.avg_temp_us - uk.avg_temp_uk as avg_temp_diff
from
us_metrics us
join
uk_metrics uk
on
us.metric_date = uk.metric_date
)
select
metric_date,
max_temp_diff,
min_temp_diff,
avg_temp_diff
from
temp_differences
order by
metric_date;
WITH
python2_questions AS (
SELECT
q.id AS question_id,
q.title,
q.body AS question_body,
q.tags
FROM
`bigquery-public-data.stackoverflow.posts_questions` q
WHERE
(LOWER(q.tags) LIKE '%python-2%'
OR LOWER(q.tags) LIKE '%python-2.x%'
OR (
LOWER(q.title) LIKE '%python 2%'
OR LOWER(q.body) LIKE '%python 2%'
OR LOWER(q.title) LIKE '%python2%'
OR LOWER(q.body) LIKE '%python2%'
))
AND (
LOWER(q.title) NOT LIKE '%python 3%'
AND LOWER(q.body) NOT LIKE '%python 3%'
AND LOWER(q.title) NOT LIKE '%python3%'
AND LOWER(q.body) NOT LIKE '%python3%'
)
)
SELECT
COUNT(*) AS count_number
FROM
python2_questions q
LEFT JOIN
`bigquery-public-data.stackoverflow.posts_answers` a
ON
q.question_id = a.parent_id
GROUP BY q.question_id
ORDER BY count_number DESC
LIMIT 1
SELECT
answer.id AS a_id,
(SELECT users.reputation FROM `bigquery-public-data.stackoverflow.users` users
WHERE users.id = answer.owner_user_id) AS a_user_reputation,
answer.score AS a_score,
answer.comment_count AS answer_comment_count,
questions.tags as q_tags,
questions.score AS q_score,
questions.answer_count AS answer_count,
(SELECT users.reputation FROM `bigquery-public-data.stackoverflow.users` users
WHERE users.id = questions.owner_user_id) AS q_user_reputation,
questions.view_count AS q_view_count,
questions.comment_count AS q_comment_count
FROM
`bigquery-public-data.stackoverflow.posts_answers` AS answer
LEFT JOIN
`bigquery-public-data.stackoverflow.posts_questions` AS questions
ON answer.parent_id = questions.id
WHERE
answer.id = questions.accepted_answer_id
AND
(
questions.tags LIKE '%javascript%' AND
(questions.tags LIKE '%xss%' OR
questions.tags LIKE '%cross-site%' OR
questions.tags LIKE '%exploit%' OR
questions.tags LIKE '%cybersecurity%')
)
AND DATE(questions.creation_date) BETWEEN '2016-01-01' AND '2016-01-31'
AND DATE(answer.creation_date) BETWEEN '2016-01-01' AND '2016-01-31'
WITH
-- Get recent data
RecentData AS (
SELECT
FORMAT_TIMESTAMP('%Y%m', creation_date) AS month_index,
tags
FROM
`bigquery-public-data.stackoverflow.posts_questions`
WHERE
EXTRACT(YEAR FROM DATE(creation_date)) = 2022
),
-- Monthly number of questions posted
MonthlyQuestions AS (
SELECT
month_index,
COUNT(*) AS num_questions
FROM
RecentData
GROUP BY
month_index
),
-- Monthly number of questions posted with specific tags
TaggedQuestions AS (
SELECT
month_index,
tag,
COUNT(*) AS num_tags
FROM
RecentData,
UNNEST(SPLIT(tags, '|')) AS tag
WHERE
tag IN ('python')
GROUP BY
month_index, tag
)
SELECT
a.month_index,
a.num_tags / b.num_questions AS proportion
FROM
TaggedQuestions a
LEFT JOIN
MonthlyQuestions b ON a.month_index = b.month_index
ORDER BY
a.month_index, proportion DESC;
SELECT u_id, tags
FROM (
-- select comments with tags from the post
SELECT cm.u_id, cm.creation_date, cm.text, pq.tags, "comment" as type
FROM (
SELECT a.parent_id as q_id, c.user_id as u_id, c.creation_date as creation_date, c.text as text
FROM `bigquery-public-data.stackoverflow.comments` as c
INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` as a ON (a.id = c.post_id)
WHERE c.user_id BETWEEN 16712208 AND 18712208
AND DATE(c.creation_date) BETWEEN '2019-07-01' AND '2019-12-31'
UNION ALL
SELECT q.id as q_id, c.user_id as u_id, c.creation_date as creation_date, c.text as text
FROM `bigquery-public-data.stackoverflow.comments` as c
INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` as q ON (q.id = c.post_id)
WHERE c.user_id BETWEEN 16712208 AND 18712208
AND DATE(c.creation_date) BETWEEN '2019-07-01' AND '2019-12-31'
) as cm
INNER JOIN `bigquery-public-data.stackoverflow.posts_questions` as pq ON (pq.id = cm.q_id)
UNION ALL
-- select answers with tags related to the post
SELECT pa.owner_user_id as u_id, pa.creation_date as creation_date, pa.body as text, pq.tags as tags, "answer" as type
FROM `bigquery-public-data.stackoverflow.posts_answers` as pa
LEFT OUTER JOIN `bigquery-public-data.stackoverflow.posts_questions` as pq ON pq.id = pa.parent_id
WHERE pa.owner_user_id BETWEEN 16712208 AND 18712208
AND DATE(pa.creation_date) BETWEEN '2019-07-01' AND '2019-12-31'
UNION ALL
-- select posts
SELECT pq.owner_user_id as u_id, pq.creation_date as creation_date, pq.body as text, pq.tags as tags, "question" as type
FROM `bigquery-public-data.stackoverflow.posts_questions` as pq
WHERE pq.owner_user_id BETWEEN 16712208 AND 18712208
AND DATE(pq.creation_date) BETWEEN '2019-07-01' AND '2019-12-31'
)
ORDER BY u_id, creation_date;
WITH
tags_to_use AS (
SELECT tag, idx
FROM UNNEST([
'android-layout',
'android-activity',
'android-intent',
'android-edittext',
'android-fragments',
'android-recyclerview',
'listview',
'android-actionbar',
'google-maps',
'android-asynctask'
]) AS tag WITH OFFSET idx
),
android_how_to_questions AS (
SELECT
PQ.*
FROM
bigquery-public-data.stackoverflow.posts_questions PQ
WHERE
EXISTS (
SELECT 1
FROM UNNEST(SPLIT(PQ.tags, '|')) tag
WHERE tag IN (SELECT tag FROM tags_to_use)
)
AND (LOWER(PQ.title) LIKE '%how%' OR LOWER(PQ.body) LIKE '%how%')
AND NOT (LOWER(PQ.title) LIKE '%fail%' OR LOWER(PQ.title) LIKE '%problem%' OR LOWER(PQ.title) LIKE '%error%'
OR LOWER(PQ.title) LIKE '%wrong%' OR LOWER(PQ.title) LIKE '%fix%' OR LOWER(PQ.title) LIKE '%bug%'
OR LOWER(PQ.title) LIKE '%issue%' OR LOWER(PQ.title) LIKE '%solve%' OR LOWER(PQ.title) LIKE '%trouble%')
AND NOT (LOWER(PQ.body) LIKE '%fail%' OR LOWER(PQ.body) LIKE '%problem%' OR LOWER(PQ.body) LIKE '%error%'
OR LOWER(PQ.body) LIKE '%wrong%' OR LOWER(PQ.body) LIKE '%fix%' OR LOWER(PQ.body) LIKE '%bug%'
OR LOWER(PQ.body) LIKE '%issue%' OR LOWER(PQ.body) LIKE '%solve%' OR LOWER(PQ.body) LIKE '%trouble%')
),
questions_with_tag_rankings AS (
SELECT
T.id AS tag_id,
TTU.idx AS tag_offset,
T.tag_name,
T.wiki_post_id AS tag_wiki_post_id,
Q.id AS question_id,
Q.title,
Q.tags,
Q.view_count,
RANK() OVER (PARTITION BY T.id ORDER BY Q.view_count DESC) AS question_view_count_rank,
COUNT(*) OVER (PARTITION BY T.id) AS total_valid_questions
FROM
bigquery-public-data.stackoverflow.tags T
INNER JOIN
tags_to_use TTU ON T.tag_name = TTU.tag
INNER JOIN
android_how_to_questions Q ON T.tag_name IN UNNEST(SPLIT(Q.tags, '|'))
)
SELECT
question_id
FROM
questions_with_tag_rankings
WHERE
question_view_count_rank <= 50 AND total_valid_questions >= 50
ORDER BY
tag_offset ASC, question_view_count_rank ASC;
SELECT
Day_of_Week,
COUNT(1) AS Num_Questions,
SUM(answered_in_1h) AS Num_Answered_in_1H,
ROUND(100 * SUM(answered_in_1h) / COUNT(1),1) AS Percent_Answered_in_1H
FROM
(
SELECT
q.id AS question_id,
EXTRACT(DAYOFWEEK FROM q.creation_date) AS day_of_week,
MAX(IF(a.parent_id IS NOT NULL AND
(UNIX_SECONDS(a.creation_date)-UNIX_SECONDS(q.creation_date))/(60*60) <= 1, 1, 0)) AS answered_in_1h
FROM
`bigquery-public-data.stackoverflow.posts_questions` q
LEFT JOIN
`bigquery-public-data.stackoverflow.posts_answers` a
ON q.id = a.parent_id
WHERE EXTRACT(YEAR FROM a.creation_date) = 2020
AND EXTRACT(YEAR FROM q.creation_date) = 2020
GROUP BY question_id, day_of_week
)
GROUP BY
Day_of_Week
ORDER BY
Day_of_Week;
WITH badge_counts AS (
SELECT
c.id,
COUNT(DISTINCT d.id) AS badge_number
FROM
`bigquery-public-data.stackoverflow.users` AS c
JOIN
`bigquery-public-data.stackoverflow.badges` AS d
ON
c.id = d.user_id
GROUP BY
c.id
),
labeled_questions AS (
SELECT
a.id,
IF(
a.id IN (
SELECT DISTINCT b.id
FROM
`bigquery-public-data.stackoverflow.posts_answers` AS a
JOIN
`bigquery-public-data.stackoverflow.posts_questions` AS b
ON
a.parent_id = b.id
WHERE
b.accepted_answer_id IS NULL
AND a.score / b.view_count > 0.01
) OR accepted_answer_id IS NOT NULL,
1,
0
) AS label,
a.owner_user_id,
LENGTH(a.body) AS body_length
FROM
`bigquery-public-data.stackoverflow.posts_questions` AS a
)
SELECT
lq.id,
b.reputation,
b.up_votes - b.down_votes AS net_votes,
e.badge_number
FROM
labeled_questions AS lq
JOIN
`bigquery-public-data.stackoverflow.users` AS b
ON
lq.owner_user_id = b.id
JOIN
badge_counts AS e
ON
b.id = e.id
WHERE
lq.label = 1
ORDER BY
lq.body_length DESC
LIMIT
10;
WITH
tags_to_use AS (
SELECT tag, idx
FROM UNNEST([
'android-layout',
'android-activity',
'android-intent',
'android-edittext',
'android-fragments',
'android-recyclerview',
'listview',
'android-actionbar',
'google-maps',
'android-asynctask'
]) AS tag WITH OFFSET idx
),
android_how_to_questions AS (
SELECT
PQ.*
FROM
`bigquery-public-data.stackoverflow.posts_questions` PQ
WHERE
EXISTS (
SELECT 1
FROM UNNEST(SPLIT(PQ.tags, '|')) tag
WHERE tag IN (SELECT tag FROM tags_to_use)
)
AND (LOWER(PQ.title) LIKE '%how%' OR LOWER(PQ.body) LIKE '%how%')
),
most_viewed_question AS (
SELECT
T.id AS tag_id,
T.tag_name,
Q.id AS question_id,
Q.title,
Q.tags,
Q.view_count
FROM
`bigquery-public-data.stackoverflow.tags` T
INNER JOIN
tags_to_use TTU ON T.tag_name = TTU.tag
INNER JOIN
android_how_to_questions Q ON T.tag_name IN UNNEST(SPLIT(Q.tags, '|'))
ORDER BY Q.view_count DESC
LIMIT 1
)
SELECT
title
FROM
most_viewed_question;
WITH russia_Data AS (
SELECT DISTINCT
id.country_name,
id.value, -- Format in DataStudio
id.indicator_name
FROM (
SELECT
country_code,
region
FROM
bigquery-public-data.world_bank_intl_debt.country_summary
WHERE
region != "" -- Aggregated countries do not have a region
) cs -- Aggregated countries do not have a region
INNER JOIN (
SELECT
country_code,
country_name,
value,
indicator_name
FROM
bigquery-public-data.world_bank_intl_debt.international_debt
WHERE
country_code = 'RUS'
) id
ON
cs.country_code = id.country_code
WHERE value IS NOT NULL
)
-- Count the number of indicators with a value of 0 for Russia
SELECT
COUNT(*) AS number_of_indicators_with_zero
FROM
russia_Data
WHERE
value = 0;
WITH country_data AS (
-- CTE for country descriptive data
SELECT
country_code,
short_name AS country,
region,
income_group
FROM
`bigquery-public-data.world_bank_wdi.country_summary`
),
gdp_data AS (
-- Filter data to only include GDP values
SELECT
data.country_code,
country,
region,
value AS gdp_value
FROM
`bigquery-public-data.world_bank_wdi.indicators_data` data
LEFT JOIN country_data
ON data.country_code = country_data.country_code
WHERE indicator_code = "NY.GDP.MKTP.KD" -- GDP Indicator
AND country_data.region IS NOT NULL
AND country_data.income_group IS NOT NULL
),
cal_median_gdp AS (
-- Calculate the median GDP value for each region
SELECT
region,
APPROX_QUANTILES(gdp_value, 2)[OFFSET(1)] AS median_gdp
FROM gdp_data
GROUP BY region
)
-- Select the regions with their median GDP values
SELECT
region
FROM
cal_median_gdp
ORDER BY median_gdp DESC
LIMIT 1;
WITH _fips AS (
SELECT
state_fips_code
FROM
`bigquery-public-data.census_utility.fips_codes_states`
WHERE
state_name = "Colorado"
),
_bg AS (
SELECT
b.geo_id,
b.blockgroup_geom,
ST_AREA(b.blockgroup_geom) AS bg_size
FROM
`bigquery-public-data.geo_census_blockgroups.us_blockgroups_national` b
JOIN
_fips u ON b.state_fips_code = u.state_fips_code
),
_zip AS (
SELECT
z.zip_code,
z.zip_code_geom
FROM
`bigquery-public-data.geo_us_boundaries.zip_codes` z
JOIN
_fips u ON z.state_fips_code = u.state_fips_code
),
bq_zip_overlap AS (
SELECT
b.geo_id,
z.zip_code,
ST_AREA(ST_INTERSECTION(b.blockgroup_geom, z.zip_code_geom)) / b.bg_size AS overlap_size,
b.blockgroup_geom
FROM
_zip z
JOIN
_bg b ON ST_INTERSECTS(b.blockgroup_geom, z.zip_code_geom)
),
locations AS (
SELECT
SUM(overlap_size * count_locations) AS locations_per_bg,
l.zip_code
FROM (
SELECT
COUNT(CONCAT(institution_name, " : ", branch_name)) AS count_locations,
zip_code
FROM
`bigquery-public-data.fdic_banks.locations`
WHERE
state IS NOT NULL
AND state_name IS NOT NULL
GROUP BY
zip_code
) l
JOIN
bq_zip_overlap ON l.zip_code = bq_zip_overlap.zip_code
GROUP BY
l.zip_code
)
SELECT
l.zip_code
FROM
locations l
GROUP BY
l.zip_code
ORDER BY
MAX(locations_per_bg) DESC
LIMIT 1;
WITH population_change AS (
SELECT
a.geo_id,
a.total_pop AS pop_2011,
b.total_pop AS pop_2018,
((b.total_pop - a.total_pop) / a.total_pop) * 100 AS population_change_percentage
FROM
bigquery-public-data.census_bureau_acs.censustract_2011_5yr a
JOIN
bigquery-public-data.census_bureau_acs.censustract_2018_5yr b
ON
a.geo_id = b.geo_id
WHERE
a.total_pop > 1000
AND b.total_pop > 1000
AND a.geo_id LIKE '36047%'
AND b.geo_id LIKE '36047%'
ORDER BY
population_change_percentage DESC
LIMIT 20
),
acs_2018 AS (
SELECT
geo_id,
median_income AS median_income_2018
FROM
bigquery-public-data.census_bureau_acs.censustract_2018_5yr
WHERE
geo_id LIKE '36047%'
AND total_pop > 1000
),
acs_2011 AS (
SELECT
geo_id,
median_income AS median_income_2011
FROM
bigquery-public-data.census_bureau_acs.censustract_2011_5yr
WHERE
geo_id LIKE '36047%'
AND total_pop > 1000
),
acs_diff AS (
SELECT
a18.geo_id,
a18.median_income_2018,
a11.median_income_2011,
(a18.median_income_2018 - a11.median_income_2011) AS median_income_diff
FROM
acs_2018 a18
JOIN
acs_2011 a11
ON a18.geo_id = a11.geo_id
WHERE
(a18.median_income_2018 - a11.median_income_2011) IS NOT NULL
ORDER BY
(a18.median_income_2018 - a11.median_income_2011) DESC
LIMIT 20
),
common_geoids AS (
SELECT population_change.geo_id
FROM population_change
JOIN acs_diff ON population_change.geo_id = acs_diff.geo_id
)
SELECT geo_id FROM common_geoids;
WITH monthly_totals AS (
SELECT
SUM(CASE WHEN subscriber_type = 'Customer' THEN duration_sec / 60 ELSE NULL END) AS customer_minutes_sum,
SUM(CASE WHEN subscriber_type = 'Subscriber' THEN duration_sec / 60 ELSE NULL END) AS subscriber_minutes_sum,
EXTRACT(MONTH FROM end_date) AS end_month
FROM
`bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`
WHERE
EXTRACT(YEAR FROM end_date) = 2017
GROUP BY
end_month
),
cumulative_totals AS (
SELECT
end_month,
SUM(customer_minutes_sum) OVER (ORDER BY end_month ROWS UNBOUNDED PRECEDING) / 1000 AS cumulative_minutes_cust,
SUM(subscriber_minutes_sum) OVER (ORDER BY end_month ROWS UNBOUNDED PRECEDING) / 1000 AS cumulative_minutes_sub
FROM
monthly_totals
),
differences AS (
SELECT
end_month,
ABS(cumulative_minutes_cust - cumulative_minutes_sub) AS abs_diff
FROM
cumulative_totals
)
SELECT
end_month
FROM
differences
ORDER BY
abs_diff DESC
LIMIT 1;
DECLARE
my_drug_list ARRAY<STRING>;
SET
my_drug_list = [
'Keytruda',
'Vioxx',
'Humira',
'Premarin' ];
SELECT
id AS drug_id,
tradeNameList.element AS drug_trade_name,
drugType AS drug_type,
hasBeenWithdrawn AS drug_withdrawn
FROM
`open-targets-prod.platform.molecule`,
UNNEST (tradeNames.list) AS tradeNameList
WHERE
tradeNameList.element IN UNNEST(my_drug_list)
AND isApproved = TRUE
AND blackBoxWarning = TRUE
AND drugType != 'Unknown';
WITH natality_2018 AS (
SELECT County_of_Residence_FIPS AS FIPS, Ave_Number_of_Prenatal_Wks AS Vist_Ave, County_of_Residence
FROM `bigquery-public-data.sdoh_cdc_wonder_natality.county_natality`
WHERE SUBSTR(County_of_Residence_FIPS, 0, 2) = "55" AND Year = '2018-01-01'
),
acs_2017 AS (
SELECT geo_id, commute_45_59_mins, employed_pop
FROM `bigquery-public-data.census_bureau_acs.county_2017_5yr`
),
corr_tbl AS (
SELECT
n.County_of_Residence,
ROUND((a.commute_45_59_mins / a.employed_pop) * 100, 2) AS percent_high_travel,
n.Vist_Ave
FROM acs_2017 a
JOIN natality_2018 n
ON a.geo_id = n.FIPS
)
SELECT County_of_Residence, Vist_Ave
FROM corr_tbl
WHERE percent_high_travel > 5
WITH skin_condition_ICD_concept_ids AS (
SELECT
concept_id,
CASE concept_code
WHEN 'L70' THEN 'Acne'
WHEN 'L20' THEN 'Atopic dermatitis'
WHEN 'L40' THEN 'Psoriasis'
ELSE 'Vitiligo'
END AS skin_condition
FROM
`bigquery-public-data.cms_synthetic_patient_data_omop.concept`
WHERE
concept_code IN ('L70', 'L20', 'L40', 'L80')
AND vocabulary_id = 'ICD10CM'
),
standard_concept_ids AS (
SELECT
concept_id
FROM
`bigquery-public-data.cms_synthetic_patient_data_omop.concept`
WHERE
standard_concept = 'S'
),
skin_condition_standard_concept_ids AS (
SELECT
s.skin_condition,
r.concept_id_2 AS concept_id
FROM
skin_condition_ICD_concept_ids s
JOIN
`bigquery-public-data.cms_synthetic_patient_data_omop.concept_relationship` r
ON
s.concept_id = r.concept_id_1
JOIN
standard_concept_ids sc
ON
sc.concept_id = r.concept_id_2
WHERE
r.relationship_id = 'Maps to'
),
all_skin_concept_ids AS (
SELECT DISTINCT
skin_condition,
concept_id
FROM
skin_condition_standard_concept_ids
),
descendant_concept_ids AS (
SELECT
a.skin_condition,
ca.descendant_concept_id AS concept_id
FROM
all_skin_concept_ids a
JOIN
`bigquery-public-data.cms_synthetic_patient_data_omop.concept_ancestor` ca
ON
a.concept_id = ca.ancestor_concept_id
),
participants_with_condition AS (
SELECT
d.skin_condition,
COUNT(DISTINCT co.person_id) AS nb_of_participants_with_skin_condition
FROM
`bigquery-public-data.cms_synthetic_patient_data_omop.condition_occurrence` co
JOIN
descendant_concept_ids d
ON
co.condition_concept_id = d.concept_id
GROUP BY
d.skin_condition
),
total_participants AS (
SELECT
COUNT(DISTINCT person_id) AS nb_of_participants
FROM
`bigquery-public-data.cms_synthetic_patient_data_omop.person`
)
SELECT
p.skin_condition,
100 * p.nb_of_participants_with_skin_condition / t.nb_of_participants AS percentage_of_participants
FROM
participants_with_condition p,
total_participants t
WITH quinapril_concept AS (
SELECT concept_id
FROM `bigquery-public-data.cms_synthetic_patient_data_omop.concept`
WHERE concept_code = "35208" AND vocabulary_id = "RxNorm"
),
quinapril_related_medications AS (
SELECT DISTINCT descendant_concept_id AS concept_id
FROM `bigquery-public-data.cms_synthetic_patient_data_omop.concept_ancestor`
WHERE ancestor_concept_id IN (SELECT concept_id FROM quinapril_concept)
),
participants_with_quinapril AS (
SELECT COUNT(DISTINCT person_id) AS count
FROM `bigquery-public-data.cms_synthetic_patient_data_omop.drug_exposure`
WHERE drug_concept_id IN (SELECT concept_id FROM quinapril_related_medications)
),
total_participants AS (
SELECT COUNT(DISTINCT person_id) AS count
FROM `bigquery-public-data.cms_synthetic_patient_data_omop.person`
)
SELECT
100 - (100 * participants_with_quinapril.count / total_participants.count) AS without_quinapril
FROM
participants_with_quinapril, total_participants
WITH DailyAverages AS (
SELECT
year, month, day, latitude, longitude,
AVG(wind_speed) AS avg_wind_speed,
FROM
`bigquery-public-data.noaa_icoads.icoads_core_*`
WHERE
_TABLE_SUFFIX BETWEEN '2005' AND '2015'
GROUP BY
year, month, day, latitude, longitude
)
SELECT
year, month, day, latitude, longitude,
avg_wind_speed,
FROM
DailyAverages
WHERE
avg_wind_speed IS NOT NULL
ORDER BY avg_wind_speed DESC LIMIT 5
WITH specialist_counts AS (
SELECT
healthcare_provider_taxonomy_1_specialization,
COUNT(DISTINCT npi) AS number_specialist
FROM
`bigquery-public-data.nppes.npi_optimized`
WHERE
provider_business_practice_location_address_city_name = "MOUNTAIN VIEW"
AND provider_business_practice_location_address_state_name = "CA"
AND healthcare_provider_taxonomy_1_specialization > ""
GROUP BY
healthcare_provider_taxonomy_1_specialization
),
top_10_specialists AS (
SELECT
healthcare_provider_taxonomy_1_specialization,
number_specialist
FROM
specialist_counts
ORDER BY
number_specialist DESC
LIMIT 10
),
average_value AS (
SELECT
AVG(number_specialist) AS average_specialist
FROM
top_10_specialists
),
closest_to_average AS (
SELECT
healthcare_provider_taxonomy_1_specialization,
number_specialist,
ABS(number_specialist - (SELECT average_specialist FROM average_value)) AS difference
FROM
top_10_specialists
)
SELECT
healthcare_provider_taxonomy_1_specialization
FROM
closest_to_average
ORDER BY
difference
LIMIT 1;
select company from
(select *,
row_number() over(partition by company order by month_o_month_calc desc) as rownum
from
(select *,
num_trips - lag(num_trips) over(partition by company order by month) as month_o_month_calc
from
(SELECT
company,
format_date("%Y-%m", date_sub((cast(trip_start_timestamp as date)), interval 1 month)) as prev_month,
format_date("%Y-%m", cast(trip_start_timestamp as date)) AS month,
count(1) AS num_trips
from `bigquery-public-data.chicago_taxi_trips.taxi_trips`
where extract(YEAR from trip_start_timestamp) = 2018
group by company, month, prev_month
order by company,month)
order by company, month_o_month_calc desc)
)
where rownum = 1
order by month_o_month_calc desc, company
limit 3
SELECT
FORMAT('%02.0fm to %02.0fm', min_minutes, max_minutes) AS minutes_range,
SUM(trips) AS total_trips,
FORMAT('%3.2f', SUM(total_fare) / SUM(trips)) AS average_fare
FROM (
SELECT
MIN(duration_in_minutes) OVER (quantiles) AS min_minutes,
MAX(duration_in_minutes) OVER (quantiles) AS max_minutes,
SUM(trips) AS trips,
SUM(total_fare) AS total_fare
FROM (
SELECT
ROUND(trip_seconds / 60) AS duration_in_minutes,
NTILE(10) OVER (ORDER BY trip_seconds / 60) AS quantile,
COUNT(1) AS trips,
SUM(fare) AS total_fare
FROM
`bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
ROUND(trip_seconds / 60) BETWEEN 1 AND 50
GROUP BY
trip_seconds,
duration_in_minutes )
GROUP BY
duration_in_minutes,
quantile
WINDOW quantiles AS (PARTITION BY quantile)
)
GROUP BY
minutes_range
ORDER BY
Minutes_range
SELECT period, description, c FROM (
SELECT
a.period,
b.description,
count(*) c,
row_number() over (partition by period order by count(*) desc) seqnum
FROM `bigquery-public-data.the_met.objects` a
JOIN (
SELECT
label.description as description,
object_id
FROM `bigquery-public-data.the_met.vision_api_data`, UNNEST(labelAnnotations) label
) b
ON a.object_id = b.object_id
WHERE a.period is not null
group by 1,2
)
WHERE seqnum <= 3
AND c >= 500 # only include labels that have 50 or more pieces associated with it
ORDER BY period, c desc;
WITH initial_visits AS (
SELECT
fullVisitorId,
MIN(visitStartTime) AS initialVisitStartTime
FROM
`bigquery-public-data.google_analytics_sample.*`
WHERE
totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430'
GROUP BY
fullVisitorId
),
qualified_initial_visits AS (
SELECT
s.fullVisitorId,
s.visitStartTime AS initialVisitStartTime,
s.totals.timeOnSite AS time_on_site
FROM
`bigquery-public-data.google_analytics_sample.*` s
JOIN initial_visits i
ON s.fullVisitorId = i.fullVisitorId
AND s.visitStartTime = i.initialVisitStartTime
WHERE
s.totals.timeOnSite > 300
),
filtered_data AS (
SELECT
q.fullVisitorId,
q.time_on_site,
IF(COUNTIF(s.visitStartTime > q.initialVisitStartTime AND s.totals.transactions > 0) > 0, 1, 0) AS will_buy_on_return_visit
FROM
qualified_initial_visits q
LEFT JOIN `bigquery-public-data.google_analytics_sample.*` s
ON q.fullVisitorId = s.fullVisitorId
GROUP BY
q.fullVisitorId, q.time_on_site
),
matching_users AS (
SELECT
fullVisitorId
FROM
filtered_data
WHERE
time_on_site > 300 AND will_buy_on_return_visit = 1
),
total_new_users AS (
SELECT
COUNT(DISTINCT fullVisitorId) AS total_new_users
FROM
`bigquery-public-data.google_analytics_sample.*`
WHERE
totals.newVisits = 1
AND date BETWEEN '20160801' AND '20170430'
),
final_counts AS (
SELECT
COUNT(DISTINCT fullVisitorId) AS users_matching_criteria
FROM
matching_users
)
SELECT
(final_counts.users_matching_criteria / total_new_users.total_new_users) * 100 AS percentage_matching_criteria
FROM
final_counts,
total_new_users;
WITH station_neighborhoods AS (
SELECT
bs.station_id,
bs.name AS station_name,
nb.neighborhood
FROM `bigquery-public-data.san_francisco.bikeshare_stations` bs
JOIN
bigquery-public-data.san_francisco_neighborhoods.boundaries nb
ON
ST_Intersects(ST_GeogPoint(bs.longitude, bs.latitude), nb.neighborhood_geom)
),
neighborhood_crime_counts AS (
SELECT
neighborhood,
COUNT(*) AS crime_count
FROM (
SELECT
n.neighborhood
FROM
bigquery-public-data.san_francisco.sfpd_incidents i
JOIN
bigquery-public-data.san_francisco_neighborhoods.boundaries n
ON
ST_Intersects(ST_GeogPoint(i.longitude, i.latitude), n.neighborhood_geom)
) AS incident_neighborhoods
GROUP BY
neighborhood
)
SELECT
sn.neighborhood,
COUNT(station_name) AS station_number,
ANY_VALUE(ncc.crime_count) AS crime_number
FROM
station_neighborhoods sn
JOIN
neighborhood_crime_counts ncc
ON
sn.neighborhood = ncc.neighborhood
GROUP BY sn.neighborhood
ORDER BY
crime_number ASC
WITH AvgScore AS (
SELECT
AVG(associations.score) AS avg_score
FROM
`open-targets-prod.platform.associationByOverallDirect` AS associations
JOIN
`open-targets-prod.platform.diseases` AS diseases
ON
associations.diseaseId = diseases.id
WHERE
diseases.name = 'psoriasis'
)
SELECT
targets.approvedSymbol AS target_approved_symbol
FROM
`open-targets-prod.platform.associationByOverallDirect` AS associations
JOIN
`open-targets-prod.platform.diseases` AS diseases
ON
associations.diseaseId = diseases.id
JOIN
`open-targets-prod.platform.targets` AS targets
ON
associations.targetId = targets.id
CROSS JOIN
AvgScore
WHERE
diseases.name = 'psoriasis'
ORDER BY
ABS(associations.score - AvgScore.avg_score) ASC
LIMIT 1
WITH data AS (
SELECT
EXTRACT(YEAR FROM wx.date) AS year,
MAX(IF(wx.element = 'PRCP', wx.value/10, NULL)) AS max_prcp,
MAX(IF(wx.element = 'TMIN', wx.value/10, NULL)) AS max_tmin,
MAX(IF(wx.element = 'TMAX', wx.value/10, NULL)) AS max_tmax
FROM
`bigquery-public-data.ghcn_d.ghcnd_2013` AS wx
WHERE
wx.id = 'USW00094846' AND
wx.qflag IS NULL AND
wx.value IS NOT NULL AND
DATE_DIFF(DATE('2013-12-31'), wx.date, DAY) < 15
GROUP BY
year
UNION ALL
SELECT
EXTRACT(YEAR FROM wx.date) AS year,
MAX(IF(wx.element = 'PRCP', wx.value/10, NULL)) AS max_prcp,
MAX(IF(wx.element = 'TMIN', wx.value/10, NULL)) AS max_tmin,
MAX(IF(wx.element = 'TMAX', wx.value/10, NULL)) AS max_tmax
FROM
`bigquery-public-data.ghcn_d.ghcnd_2014` AS wx
WHERE
wx.id = 'USW00094846' AND
wx.qflag IS NULL AND
wx.value IS NOT NULL AND
DATE_DIFF(DATE('2014-12-31'), wx.date, DAY) < 15
GROUP BY
year
UNION ALL
SELECT
EXTRACT(YEAR FROM wx.date) AS year,
MAX(IF(wx.element = 'PRCP', wx.value/10, NULL)) AS max_prcp,
MAX(IF(wx.element = 'TMIN', wx.value/10, NULL)) AS max_tmin,
MAX(IF(wx.element = 'TMAX', wx.value/10, NULL)) AS max_tmax
FROM
`bigquery-public-data.ghcn_d.ghcnd_2015` AS wx
WHERE
wx.id = 'USW00094846' AND
wx.qflag IS NULL AND
wx.value IS NOT NULL AND
DATE_DIFF(DATE('2015-12-31'), wx.date, DAY) < 15
GROUP BY
year
UNION ALL
SELECT
EXTRACT(YEAR FROM wx.date) AS year,
MAX(IF(wx.element = 'PRCP', wx.value/10, NULL)) AS max_prcp,
MAX(IF(wx.element = 'TMIN', wx.value/10, NULL)) AS max_tmin,
MAX(IF(wx.element = 'TMAX', wx.value/10, NULL)) AS max_tmax
FROM
`bigquery-public-data.ghcn_d.ghcnd_2016` AS wx
WHERE
wx.id = 'USW00094846' AND
wx.qflag IS NULL AND
wx.value IS NOT NULL AND
DATE_DIFF(DATE('2016-12-31'), wx.date, DAY) < 15
GROUP BY
year
)
SELECT
year,
MAX(max_prcp) AS annual_max_prcp,
MAX(max_tmin) AS annual_max_tmin,
MAX(max_tmax) AS annual_max_tmax
FROM data
GROUP BY year
ORDER BY year ASC;
SELECT
pm10.month AS month,
pm10.avg AS pm10,
pm25_frm.avg AS pm25_frm,
pm25_nonfrm.avg AS pm25_nonfrm,
co.avg AS co,
so2.avg AS so2,
lead.avg AS lead
FROM
(SELECT AVG(arithmetic_mean) AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.pm10_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS pm10
JOIN
(SELECT AVG(arithmetic_mean) AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.pm25_frm_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS pm25_frm
ON pm10.year = pm25_frm.year AND pm10.month = pm25_frm.month
JOIN
(SELECT AVG(arithmetic_mean) AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.pm25_nonfrm_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS pm25_nonfrm
ON pm10.year = pm25_nonfrm.year AND pm10.month = pm25_nonfrm.month
JOIN
(SELECT AVG(arithmetic_mean) * 100 AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.lead_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS lead
ON pm10.year = lead.year AND pm10.month = lead.month
JOIN
(SELECT AVG(arithmetic_mean) AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.voc_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS co
ON pm10.year = co.year AND pm10.month = co.month
JOIN
(SELECT AVG(arithmetic_mean) * 10 AS avg,
EXTRACT(YEAR FROM date_local) AS year,
EXTRACT(MONTH FROM date_local) AS month
FROM `bigquery-public-data.epa_historical_air_quality.so2_daily_summary`
WHERE state_name = 'California' AND EXTRACT(YEAR FROM date_local) = 2020
GROUP BY year, month) AS so2
ON pm10.year = so2.year AND pm10.month = so2.month
ORDER BY
month;
WITH
# FIRST CAST EACH YEAR, MONTH, DATE TO STRINGS
T AS (
SELECT
*,
CAST(year AS STRING) AS year_string,
CAST(mo AS STRING) AS month_string,
CAST(da AS STRING) AS day_string
FROM
`bigquery-public-data.noaa_gsod.gsod2009`
WHERE
stn = "723758"
),
# SECOND, CONCAT ALL THE STRINGS TOGETHER INTO ONE COLUMN
TT AS (
SELECT
*,
CONCAT(year_string, "-", month_string, "-", day_string) AS date_string
FROM
T
),
# THIRD, CAST THE DATE STRING INTO A DATE FORMAT
TTT AS (
SELECT
*,
CAST(date_string AS DATE) AS date_date
FROM
TT
),
# FOURTH, CALCULATE THE MEAN TEMPERATURE FOR EACH DATE
Temp_Avg AS (
SELECT
date_date,
AVG(temp) AS avg_temp
FROM
TTT
WHERE
date_date BETWEEN '2009-10-01' AND '2009-10-31'
GROUP BY
date_date
)
# FINAL SELECTION OF TOP 3 DATES WITH HIGHEST MEAN TEMPERATURE
SELECT
date_date AS dates
FROM
Temp_Avg
ORDER BY
avg_temp DESC
LIMIT 3;
WITH DailyAverages AS (
SELECT
year, month, day,
air_temperature,
wetbulb_temperature,
dewpoint_temperature,
sea_surface_temp
FROM
`bigquery-public-data.noaa_icoads.icoads_core_*`
WHERE
_TABLE_SUFFIX BETWEEN '2010' AND '2014'
),
MonthlyAverages AS (
SELECT
year,
month,
AVG(air_temperature) AS avg_air_temperature,
AVG(wetbulb_temperature) AS avg_wetbulb_temperature,
AVG(dewpoint_temperature) AS avg_dewpoint_temperature,
AVG(sea_surface_temp) AS avg_sea_surface_temp
FROM
DailyAverages
WHERE
air_temperature IS NOT NULL
AND wetbulb_temperature IS NOT NULL
AND dewpoint_temperature IS NOT NULL
AND sea_surface_temp IS NOT NULL
GROUP BY
year, month
),
DifferenceSums AS (
SELECT
year,
month,
(ABS(avg_air_temperature - avg_wetbulb_temperature) +
ABS(avg_air_temperature - avg_dewpoint_temperature) +
ABS(avg_air_temperature - avg_sea_surface_temp) +
ABS(avg_wetbulb_temperature - avg_dewpoint_temperature) +
ABS(avg_wetbulb_temperature - avg_sea_surface_temp) +
ABS(avg_dewpoint_temperature - avg_sea_surface_temp)) AS sum_of_differences
FROM
MonthlyAverages
)
SELECT
year,
month,
sum_of_differences
FROM
DifferenceSums
ORDER BY
sum_of_differences ASC
LIMIT 3;
WITH homeless_2015 AS (
SELECT Unsheltered_Homeless AS U15, SUBSTR(CoC_Number, 0, 2) as State_Abbr
FROM `bigquery-public-data.sdoh_hud_pit_homelessness.hud_pit_by_coc`
WHERE Count_Year = 2015
),
homeless_2018 AS (
SELECT Unsheltered_Homeless AS U18, SUBSTR(CoC_Number, 0, 2) as State_Abbr
FROM `bigquery-public-data.sdoh_hud_pit_homelessness.hud_pit_by_coc`
WHERE Count_Year = 2018
),
unsheltered_change AS (
SELECT homeless_2018.State_Abbr,
SUM(U15) AS Unsheltered_2015,
SUM(U18) AS Unsheltered_2018,
(SUM(U18) - SUM(U15)) / SUM(U15) * 100 AS Percent_Change
FROM homeless_2018
JOIN homeless_2015
ON homeless_2018.State_Abbr = homeless_2015.State_Abbr
GROUP BY State_Abbr
),
average_change AS (
SELECT AVG(Percent_Change) AS Avg_Change
FROM unsheltered_change
),
closest_to_avg AS (
SELECT State_Abbr
FROM unsheltered_change, average_change
ORDER BY ABS(Percent_Change - Avg_Change)
LIMIT 5
)
SELECT State_Abbr FROM closest_to_avg;
WITH weekend_accidents AS (
SELECT
state_name,
CASE
WHEN atmospheric_conditions_1_name = 'Rain' THEN 'Rain'
WHEN atmospheric_conditions_1_name = 'Clear' THEN 'Clear'
ELSE 'Other'
END AS Weather_Condition,
COUNT(DISTINCT consecutive_number) AS num_accidents
FROM
`bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
WHERE
EXTRACT(DAYOFWEEK FROM timestamp_of_crash) IN (1, 7) -- 1 = Sunday, 7 = Saturday
AND atmospheric_conditions_1_name IN ('Rain', 'Clear')
GROUP BY
state_name, Weather_Condition
),
weather_difference AS (
SELECT
state_name,
MAX(CASE WHEN Weather_Condition = 'Rain' THEN num_accidents ELSE 0 END) AS Rain_Accidents,
MAX(CASE WHEN Weather_Condition = 'Clear' THEN num_accidents ELSE 0 END) AS Clear_Accidents,
ABS(MAX(CASE WHEN Weather_Condition = 'Rain' THEN num_accidents ELSE 0 END) -
MAX(CASE WHEN Weather_Condition = 'Clear' THEN num_accidents ELSE 0 END)) AS Difference
FROM
weekend_accidents
GROUP BY
state_name
)
SELECT
state_name,
Difference
FROM
weather_difference
ORDER BY
Difference DESC
LIMIT 3;
WITH tmp AS (
SELECT DISTINCT *
FROM `data-to-insights.ecommerce.rev_transactions`
-- Removing duplicated values
),
tmp1 AS (
SELECT
tmp.channelGrouping,
tmp.geoNetwork_country,
SUM(tmp.totals_transactions) AS tt
FROM tmp
GROUP BY 1, 2
),
tmp2 AS (
SELECT
channelGrouping,
geoNetwork_country,
SUM(tt) AS TotalTransaction,
COUNT(DISTINCT geoNetwork_country) OVER (PARTITION BY channelGrouping) AS CountryCount
FROM tmp1
GROUP BY channelGrouping, geoNetwork_country
),
tmp3 AS (
SELECT
channelGrouping,
geoNetwork_country AS Country,
TotalTransaction,
RANK() OVER (PARTITION BY channelGrouping ORDER BY TotalTransaction DESC) AS rnk
FROM tmp2
WHERE CountryCount > 1
)
SELECT
channelGrouping,
Country,
TotalTransaction
FROM tmp3
WHERE rnk = 1;
WITH russia_Data as (
SELECT distinct
id.country_name,
id.value, --format in DataStudio
id.indicator_name
FROM (
SELECT
country_code,
region
FROM
bigquery-public-data.world_bank_intl_debt.country_summary
WHERE
region != "" ) cs --aggregated countries do not have a region
INNER JOIN (
SELECT
country_code,
country_name,
value,
indicator_name
FROM
bigquery-public-data.world_bank_intl_debt.international_debt
WHERE true
and country_code = 'RUS'
) id
ON
cs.country_code = id.country_code
WHERE value is not null
ORDER BY
id.value DESC
)
SELECT
indicator_name
FROM russia_data
LIMIT 3;
WITH country_data AS (
SELECT
country_code,
short_name AS country,
region,
income_group
FROM
bigquery-public-data.world_bank_wdi.country_summary
)
, birth_rate_data AS (
SELECT
data.country_code,
country_data.country,
country_data.region,
AVG(value) AS avg_birth_rate
FROM
bigquery-public-data.world_bank_wdi.indicators_data data
LEFT JOIN
country_data
ON
data.country_code = country_data.country_code
WHERE
indicator_code = "SP.DYN.CBRT.IN" -- Birth Rate
AND EXTRACT(YEAR FROM PARSE_DATE('%Y', CAST(year AS STRING))) BETWEEN 1980 AND 1989 -- 1980s
AND country_data.income_group = "High income" -- High-income group
GROUP BY
data.country_code,
country_data.country,
country_data.region
)
, ranked_birth_rates AS (
SELECT
region,
country,
avg_birth_rate,
RANK() OVER(PARTITION BY region ORDER BY avg_birth_rate DESC) AS rank
FROM
birth_rate_data
)
SELECT
region,
country,
avg_birth_rate
FROM
ranked_birth_rates
WHERE
rank = 1
ORDER BY
region;
WITH SelectedStops AS (
SELECT
stop_id,
stop_name
FROM
`bigquery-public-data.san_francisco_transit_muni.stops`
WHERE
stop_name IN ('Clay St & Drumm St', 'Sacramento St & Davis St')
),
FilteredStopTimes AS (
SELECT
st.trip_id,
st.stop_id,
st.arrival_time,
st.departure_time,
st.stop_sequence,
ss.stop_name
FROM
`bigquery-public-data.san_francisco_transit_muni.stop_times` st
JOIN
SelectedStops ss ON CAST(st.stop_id AS STRING) = ss.stop_id
)
SELECT
t.trip_headsign,
MIN(st1.departure_time) AS start_time,
MAX(st2.arrival_time) AS end_time
FROM
`bigquery-public-data.san_francisco_transit_muni.trips` t
JOIN FilteredStopTimes st1 ON t.trip_id = CAST(st1.trip_id AS STRING) AND st1.stop_name = 'Clay St & Drumm St'
JOIN FilteredStopTimes st2 ON t.trip_id = CAST(st2.trip_id AS STRING) AND st2.stop_name = 'Sacramento St & Davis St'
WHERE
st1.stop_sequence < st2.stop_sequence
GROUP BY
t.trip_headsign;
WITH visitors AS (
SELECT
COUNT(DISTINCT fullVisitorId) AS total_visitors
FROM
`data-to-insights.ecommerce.web_analytics`
),
purchasers AS (
SELECT
COUNT(DISTINCT fullVisitorId) AS total_purchasers
FROM
`data-to-insights.ecommerce.web_analytics`
WHERE
totals.transactions IS NOT NULL
),
transactions AS (
SELECT
COUNT(*) AS total_transactions,
AVG(totals.transactions) AS avg_transactions_per_purchaser
FROM
`data-to-insights.ecommerce.web_analytics`
WHERE
totals.transactions IS NOT NULL
)
SELECT
p.total_purchasers / v.total_visitors AS conversion_rate,
a.avg_transactions_per_purchaser AS avg_transactions_per_purchaser
FROM
visitors v,
purchasers p,
transactions a;
WITH RankedData AS (
SELECT
CONCAT("20", _TABLE_SUFFIX) AS year_filed,
totrevenue,
totfuncexpns,
ROW_NUMBER() OVER (PARTITION BY CONCAT("20", _TABLE_SUFFIX) ORDER BY totrevenue)
AS revenue_rank,
ROW_NUMBER() OVER (PARTITION BY CONCAT("20", _TABLE_SUFFIX) ORDER BY totfuncexpns)
AS expense_rank,
COUNT(*) OVER (PARTITION BY CONCAT("20", _TABLE_SUFFIX)) AS total_count
FROM
`bigquery-public-data.irs_990.irs_990_20*`
),
YearlyMedians AS (
SELECT
year_filed,
IF(MOD(total_count, 2) = 1,
MAX(CASE WHEN revenue_rank = (total_count + 1) / 2 THEN totrevenue END),
AVG(CASE WHEN revenue_rank IN ((total_count / 2), (total_count / 2) + 1) THEN totrevenue END)
) AS median_revenue,
IF(MOD(total_count, 2) = 1,
MAX(CASE WHEN expense_rank = (total_count + 1) / 2 THEN totfuncexpns END),
AVG(CASE WHEN expense_rank IN ((total_count / 2), (total_count / 2) + 1) THEN totfuncexpns END)
) AS median_expense
FROM
RankedData
GROUP BY
year_filed, total_count
),
DifferenceCalculations AS (
SELECT
year_filed,
median_revenue,
median_expense,
ABS(median_revenue - median_expense) AS difference
FROM
YearlyMedians
)
SELECT
year_filed,
difference
FROM
DifferenceCalculations
WHERE
year_filed BETWEEN '2012' AND '2017'
ORDER BY
difference ASC
LIMIT 3;
CREATE TEMP FUNCTION GrowthRate(end_value FLOAT64, begin_value FLOAT64)
RETURNS FLOAT64
AS ((end_value - begin_value) / begin_value);
SELECT
GrowthRate(SUM(IF(report_year=2024, race_asian, 0)),
SUM(IF(report_year=2014, race_asian, 0))) AS race_asian_growth,
GrowthRate(SUM(IF(report_year=2024, race_black, 0)),
SUM(IF(report_year=2014, race_black, 0))) AS race_black_growth,
GrowthRate(SUM(IF(report_year=2024, race_hispanic_latinx, 0)),
SUM(IF(report_year=2014, race_hispanic_latinx, 0))) AS race_hispanic_growth,
GrowthRate(SUM(IF(report_year=2024, race_native_american, 0)),
SUM(IF(report_year=2014, race_native_american, 0))) AS race_native_american_growth,
GrowthRate(SUM(IF(report_year=2024, race_white, 0)),
SUM(IF(report_year=2014, race_white, 0))) AS race_white_growth,
GrowthRate(SUM(IF(report_year=2024, gender_us_women, 0)),
SUM(IF(report_year=2014, gender_us_women, 0))) AS gender_us_women_growth,
GrowthRate(SUM(IF(report_year=2024, gender_us_men, 0)),
SUM(IF(report_year=2014, gender_us_men, 0))) AS gender_us_men_growth,
GrowthRate(SUM(IF(report_year=2024, gender_global_women, 0)),
SUM(IF(report_year=2014, gender_global_women, 0))) AS gender_global_women_growth,
GrowthRate(SUM(IF(report_year=2024, gender_global_men, 0)),
SUM(IF(report_year=2014, gender_global_men, 0))) AS gender_global_men_growth
FROM `bigquery-public-data.google_dei.dar_non_intersectional_representation`
WHERE report_year IN (2014, 2024)
AND workforce = 'overall';
WITH population_data AS (
SELECT
geo_id,
median_age,
total_pop
FROM
`bigquery-public-data.census_bureau_acs.county_2020_5yr`
WHERE
total_pop > 50000
),
covid_data AS (
SELECT
county_fips_code,
county_name,
state,
SUM(confirmed_cases) AS total_cases,
SUM(deaths) AS total_deaths
FROM
`bigquery-public-data.covid19_usafacts.summary`
WHERE
date = '2020-08-27'
GROUP BY
county_fips_code, county_name, state
)
SELECT
covid.county_name,
covid.state,
pop.median_age,
pop.total_pop,
(covid.total_cases / pop.total_pop * 100000) AS confirmed_cases_per_100000,
(covid.total_deaths / pop.total_pop * 100000) AS deaths_per_100000,
(covid.total_deaths / covid.total_cases * 100) AS case_fatality_rate
FROM
covid_data covid
JOIN
population_data pop ON covid.county_fips_code = pop.geo_id
ORDER BY
case_fatality_rate DESC
LIMIT 3;
SELECT
COALESCE(p.journal.title, p.proceedings_title.preferred, p.book_title.preferred, p.book_series_title.preferred) AS venue,
FROM
`bigquery-public-data.dimensions_ai_covid19.publications` p
LEFT JOIN
UNNEST(research_orgs) AS research_orgs_grids
LEFT JOIN
`bigquery-public-data.dimensions_ai_covid19.grid` grid
ON
grid.id=research_orgs_grids
WHERE
EXTRACT(YEAR FROM date_inserted) >= 2021
AND
grid.address.city = 'Qianjiang'
SELECT
a.object_id,
a.title,
FORMAT_TIMESTAMP('%Y-%m-%d', a.metadata_date) AS formatted_metadata_date
FROM `bigquery-public-data.the_met.objects` a
JOIN (
SELECT object_id,
cropHints.confidence AS cropConfidence
FROM `bigquery-public-data.the_met.vision_api_data`,
UNNEST(cropHintsAnnotation.cropHints) cropHints
) b
ON a.object_id = b.object_id
WHERE a.department = "The Libraries"
AND b.cropConfidence > 0.5
AND a.title LIKE "%book%"
WITH s80 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1980`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s81 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1981`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s82 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1982`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s83 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1983`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s84 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1984`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s85 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1985`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s86 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1986`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s87 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1987`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s88 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1988`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s89 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1989`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s90 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1990`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s91 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1991`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s92 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1992`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s93 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1993`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s94 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1994`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000),
s95 as
(SELECT state, COUNT(event_id) as num_events
FROM `bigquery-public-data.noaa_historic_severe_storms.storms_1995`
GROUP BY state
ORDER BY num_events DESC
LIMIT 1000)
SELECT s80.state,
s80.num_events + s81.num_events + s82.num_events + s83.num_events + s84.num_events + s85.num_events + s86.num_events + s87.num_events + s88.num_events + s89.num_events + s90.num_events + s91.num_events + s92.num_events + s93.num_events + s94.num_events + s95.num_events as total_events
FROM s80 FULL JOIN s81 ON s80.state = s81.state
FULL JOIN s82 ON s82.state = s81.state
FULL JOIN s83 ON s83.state = s81.state
FULL JOIN s84 ON s84.state = s81.state
FULL JOIN s85 ON s85.state = s81.state
FULL JOIN s86 ON s86.state = s81.state
FULL JOIN s87 ON s87.state = s81.state
FULL JOIN s88 ON s88.state = s81.state
FULL JOIN s89 ON s89.state = s81.state
FULL JOIN s90 ON s90.state = s81.state
FULL JOIN s91 ON s91.state = s81.state
FULL JOIN s92 ON s92.state = s81.state
FULL JOIN s93 ON s93.state = s81.state
FULL JOIN s94 ON s94.state = s81.state
FULL JOIN s95 ON s95.state = s81.state
ORDER BY total_events DESC
LIMIT 5;
SELECT DISTINCT
id.country_name,
--cs.region,
id.value AS debt,
--id.indicator_code
FROM (
SELECT
country_code,
region
FROM
`bigquery-public-data.world_bank_intl_debt.country_summary`
WHERE
region != "" ) cs
INNER JOIN (
SELECT
country_code,
country_name,
value,
indicator_code
FROM
`bigquery-public-data.world_bank_intl_debt.international_debt`
WHERE
indicator_code = "DT.AMT.DLXF.CD") id
ON
cs.country_code = id.country_code
ORDER BY
id.value DESC
LIMIT 10
SELECT *
FROM (
SELECT
molregno,
comp.company,
prod.trade_name,
prod.approval_date,
ROW_NUMBER() OVER(PARTITION BY molregno ORDER BY PARSE_DATE('%Y-%m-%d', prod.approval_date) DESC) rn
FROM bigquery-public-data.ebi_chembl.compound_records_23 AS cmpd_rec
JOIN bigquery-public-data.ebi_chembl.molecule_synonyms_23 AS ms USING (molregno)
JOIN bigquery-public-data.ebi_chembl.research_companies_23 AS comp USING (res_stem_id)
JOIN bigquery-public-data.ebi_chembl.formulations_23 AS form USING (molregno)
JOIN bigquery-public-data.ebi_chembl.products_23 AS prod USING (product_id)
) as subq
WHERE rn = 1 AND company = 'SanofiAventis'
WITH top_teams AS (
SELECT
team_market
FROM (
SELECT
team_market,
player_id AS id,
SUM(points_scored)
FROM
`bigquery-public-data.ncaa_basketball.mbb_pbp_sr`
WHERE
season >= 2010 AND season <=2018 AND period = 2
GROUP BY
game_id,
team_market,
player_id
HAVING
SUM(points_scored) >= 15) C
GROUP BY
team_market
HAVING
COUNT(DISTINCT id) > 5
ORDER BY
COUNT(DISTINCT id) DESC
LIMIT 5
)
SELECT
season,
round,
days_from_epoch,
game_date,
day,
'win' AS label,
win_seed AS seed,
win_market AS market,
win_name AS name,
win_alias AS alias,
win_school_ncaa AS school_ncaa,
lose_seed AS opponent_seed,
lose_market AS opponent_market,
lose_name AS opponent_name,
lose_alias AS opponent_alias,
lose_school_ncaa AS opponent_school_ncaa
FROM `bigquery-public-data.ncaa_basketball.mbb_historical_tournament_games`
JOIN top_teams ON top_teams.team_market = win_market
WHERE season >= 2010 AND season <=2018
UNION ALL
SELECT
season,
round,
days_from_epoch,
game_date,
day,
'loss' AS label,
lose_seed AS seed,
lose_market AS market,
lose_name AS name,
lose_alias AS alias,
lose_school_ncaa AS school_ncaa,
win_seed AS opponent_seed,
win_market AS opponent_market,
win_name AS opponent_name,
win_alias AS opponent_alias,
win_school_ncaa AS opponent_school_ncaa
FROM `bigquery-public-data.ncaa_basketball.mbb_historical_tournament_games`
JOIN top_teams ON top_teams.team_market = lose_market
WHERE season >= 2010 AND season <=2018
select
-- *,
greatest(heavy_atoms_1, heavy_atoms_2) as heavy_atoms_greatest,
greatest(publication_date_1, publication_date_2) as publication_date_greatest,
greatest(doc_id_1, doc_id_2) as doc_id_greatest,
case
when
standard_value_1 > standard_value_2 and
standard_relation_1 not in ('<', '<<') and
standard_relation_2 not in ('>', '>>')
then 'decrease'
when
standard_value_1 < standard_value_2 and
standard_relation_1 not in ('>', '>>') and
standard_relation_2 not in ('<', '<<')
then 'increase'
when
standard_value_1 = standard_value_2 and
standard_relation_1 in ('=', '~') and
standard_relation_2 in ('=', '~')
then 'no-change'
else null
end as standard_change,
to_hex(md5(to_json_string(struct(activity_id_1, activity_id_2)))) as mmp_delta_uuid,
to_hex(md5(to_json_string(struct(canonical_smiles_1, canonical_smiles_2, 5)))) as mmp_search_uuid
from (
select
act.assay_id,
act.standard_type,
act.activity_id as activity_id_1,
cast(act.standard_value as numeric) as standard_value_1,
act.standard_relation as standard_relation_1,
cast(act.pchembl_value as numeric) as pchembl_value_1,
count(*) over (partition by act.assay_id) as count_activities_1,
count(*) over (partition by act.assay_id, act.molregno) as duplicate_activities_1,
act.molregno as molregno_1,
com.canonical_smiles as canonical_smiles_1,
cast(cmp.heavy_atoms as int64) as heavy_atoms_1,
cast(d.doc_id as int64) as doc_id_1,
date(
coalesce(cast(d.year as int64), 1970),
coalesce(cast(floor(percent_rank() over (
partition by d.journal, d.year order by SAFE_CAST(d.first_page as int64)) * 11) as int64) + 1, 1),
coalesce(mod(cast(floor(percent_rank() over (
partition by d.journal, d.year order by SAFE_CAST(d.first_page as int64)) * 308) as int64), 28) + 1, 1)) as publication_date_1
FROM `bigquery-public-data.ebi_chembl.activities_29` act
join `bigquery-public-data.ebi_chembl.compound_structures_29` com using (molregno)
join `bigquery-public-data.ebi_chembl.compound_properties_29` cmp using (molregno)
left join `bigquery-public-data.ebi_chembl.docs_29` d using (doc_id)
where standard_type in (select distinct standard_type from`bigquery-public-data.ebi_chembl.activities_29` where pchembl_value is not null)
) a1
join (
select
act.assay_id,
act.standard_type,
act.activity_id as activity_id_2,
cast(act.standard_value as numeric) as standard_value_2,
act.standard_relation as standard_relation_2,
cast(act.pchembl_value as numeric) as pchembl_value_2,
count(*) over (partition by act.assay_id) as count_activities_2,
count(*) over (partition by act.assay_id, act.molregno) as duplicate_activities_2,
act.molregno as molregno_2,
com.canonical_smiles as canonical_smiles_2,
cast(cmp.heavy_atoms as int64) as heavy_atoms_2,
cast(d.doc_id as int64) as doc_id_2,
date(
coalesce(cast(d.year as int64), 1970),
coalesce(cast(floor(percent_rank() over (
partition by d.journal, d.year order by SAFE_CAST(d.first_page as int64)) * 11) as int64) + 1, 1),
coalesce(mod(cast(floor(percent_rank() over (
partition by d.journal, d.year order by SAFE_CAST(d.first_page as int64)) * 308) as int64), 28) + 1, 1)) as publication_date_2
FROM `bigquery-public-data.ebi_chembl.activities_29` act
join `bigquery-public-data.ebi_chembl.compound_structures_29` com using (molregno)
join `bigquery-public-data.ebi_chembl.compound_properties_29` cmp using (molregno)
left join `bigquery-public-data.ebi_chembl.docs_29` d using (doc_id)
where standard_type in (select distinct standard_type from`bigquery-public-data.ebi_chembl.activities_29` where pchembl_value is not null)
) a2 using (assay_id, standard_type)
where
a1.molregno_1 != a2.molregno_2 and
a1.count_activities_1 < 5 and
a2.count_activities_2 < 5 and
a1.heavy_atoms_1 between 10 and 15 and
a2.heavy_atoms_2 between 10 and 15 and
a1.standard_value_1 is not null and
a2.standard_value_2 is not null and
a1.duplicate_activities_1 < 2 and
a2.duplicate_activities_2 < 2 and
a1.pchembl_value_1 > 10 and
a2.pchembl_value_2 > 10
SELECT
OrderID AS tradeID,
MaturityDate AS tradeTimestamp,
(
CASE SUBSTR(TargetCompID, 0, 4)
WHEN 'MOMO' THEN 'Momentum'
WHEN 'LUCK' THEN 'Feeling Lucky'
WHEN 'PRED' THEN 'Prediction'
END
) AS algorithm,
Symbol AS symbol,
LastPx AS openPrice,
StrikePrice AS closePrice,
(
SELECT
Side
FROM
UNNEST(Sides)
) AS tradeDirection,
(CASE (
SELECT
Side
FROM
UNNEST(Sides))
WHEN 'SHORT' THEN -1
WHEN 'LONG' THEN 1
END
) AS tradeMultiplier
FROM
`bigquery-public-data.cymbal_investments.trade_capture_report`cv
ORDER BY closePrice DESC
LIMIT 6
SELECT *
FROM (
SELECT
`start`,
`end`,
ROUND(
POW(ABS(case_ref_count - (ref_count / allele_count) * case_count) - 0.5, 2) / ((ref_count / allele_count) * case_count) +
POW(ABS(control_ref_count - (ref_count / allele_count) * control_count) - 0.5, 2) / ((ref_count / allele_count) * control_count) +
POW(ABS(case_alt_count - (alt_count / allele_count) * case_count) - 0.5, 2) / ((alt_count / allele_count) * case_count) +
POW(ABS(control_alt_count - (alt_count / allele_count) * control_count) - 0.5, 2) / ((alt_count / allele_count) * control_count),
3
) AS chi_squared_score
FROM (
SELECT
reference_name,
`start`,
`end`,
reference_bases,
alternate_bases,
vt,
SUM(ref_count + alt_count) AS allele_count,
SUM(ref_count) AS ref_count,
SUM(alt_count) AS alt_count,
SUM(IF(is_case, CAST(ref_count + alt_count AS INT64), 0)) AS case_count,
SUM(IF(NOT is_case, CAST(ref_count + alt_count AS INT64), 0)) AS control_count,
SUM(IF(is_case, ref_count, 0)) AS case_ref_count,
SUM(IF(is_case, alt_count, 0)) AS case_alt_count,
SUM(IF(NOT is_case, ref_count, 0)) AS control_ref_count,
SUM(IF(NOT is_case, alt_count, 0)) AS control_alt_count
FROM (
SELECT
v.reference_name,
v.`start`,
v.`end`,
v.reference_bases,
v.alternate_bases,
v.vt,
('EAS' = p.super_population) AS is_case,
IF(call.genotype[SAFE_OFFSET(0)] = 0, 1, 0) AS ref_count,
IF(call.genotype[SAFE_OFFSET(0)] = 1, 1, 0) AS alt_count
FROM
`spider2-public-data.1000_genomes.variants` AS v,
UNNEST(v.call) AS call
JOIN
`spider2-public-data.1000_genomes.sample_info` AS p
ON
call.call_set_name = p.sample
WHERE
v.reference_name = '12'
)
GROUP BY
reference_name,
`start`,
`end`,
reference_bases,
alternate_bases,
vt
)
WHERE
(ref_count / allele_count) * case_count >= 5.0
AND (ref_count / allele_count) * control_count >= 5.0
AND (alt_count / allele_count) * case_count >= 5.0
AND (alt_count / allele_count) * control_count >= 5.0
)
WHERE
chi_squared_score >= 29.71679
ORDER BY
chi_squared_score DESC
SELECT
reference_name,
start,
`END`,
reference_bases,
alt,
vt,
POW(hom_ref_count - expected_hom_ref_count, 2) / expected_hom_ref_count +
POW(hom_alt_count - expected_hom_alt_count, 2) / expected_hom_alt_count +
POW(het_count - expected_het_count, 2) / expected_het_count AS chi_squared_score,
total_count,
hom_ref_count,
expected_hom_ref_count AS expected_hom_ref_count,
het_count,
expected_het_count AS expected_het_count,
hom_alt_count,
expected_hom_alt_count AS expected_hom_alt_count,
alt_freq AS alt_freq,
alt_freq_from_1KG
FROM (
SELECT
reference_name,
start,
`END`,
reference_bases,
alt,
vt,
alt_freq_from_1KG,
hom_ref_freq + (0.5 * het_freq) AS hw_ref_freq,
1 - (hom_ref_freq + (0.5 * het_freq)) AS alt_freq,
POW(hom_ref_freq + (0.5 * het_freq), 2) * total_count AS expected_hom_ref_count,
POW(1 - (hom_ref_freq + (0.5 * het_freq)), 2) * total_count AS expected_hom_alt_count,
2 * (hom_ref_freq + (0.5 * het_freq)) * (1 - (hom_ref_freq + (0.5 * het_freq))) * total_count AS expected_het_count,
total_count,
hom_ref_count,
het_count,
hom_alt_count,
hom_ref_freq,
het_freq,
hom_alt_freq
FROM (
SELECT
reference_name,
start,
`END`,
reference_bases,
STRING_AGG(DISTINCT alternate_base) AS alt,
vt,
af AS alt_freq_from_1KG,
COUNTIF(first_allele IN (0, 1) AND second_allele IN (0, 1)) AS total_count,
COUNTIF(first_allele = 0 AND second_allele = 0) AS hom_ref_count,
COUNTIF((first_allele = 0 AND second_allele = 1) OR (first_allele = 1 AND second_allele = 0)) AS het_count,
COUNTIF(first_allele = 1 AND second_allele = 1) AS hom_alt_count,
SAFE_DIVIDE(COUNTIF(first_allele = 0 AND second_allele = 0), COUNTIF(first_allele IN (0, 1) AND second_allele IN (0, 1))) AS hom_ref_freq,
SAFE_DIVIDE(COUNTIF((first_allele = 0 AND second_allele = 1) OR (first_allele = 1 AND second_allele = 0)), COUNTIF(first_allele IN (0, 1) AND second_allele IN (0, 1))) AS het_freq,
SAFE_DIVIDE(COUNTIF(first_allele = 1 AND second_allele = 1), COUNTIF(first_allele IN (0, 1) AND second_allele IN (0, 1))) AS hom_alt_freq
FROM (
SELECT
reference_name,
start,
`END`,
reference_bases,
vt,
af,
call.call_set_name,
call.genotype[OFFSET(0)] AS first_allele,
call.genotype[OFFSET(1)] AS second_allele,
alternate_base
FROM
`spider2-public-data.1000_genomes.variants`,
UNNEST(call) AS call,
UNNEST(alternate_bases) AS alternate_base
WHERE
reference_name = '17'
AND start BETWEEN 41196311 AND 41277499
)
GROUP BY
reference_name,
start,
`END`,
reference_bases,
vt,
af
)
)
WITH
Params AS (
SELECT 'Google Navy Speckled Tee' AS selected_product
),
PurchaseEvents AS (
SELECT
user_pseudo_id,
items
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_TABLE_SUFFIX BETWEEN '20201201' AND '20201231'
AND event_name = 'purchase'
),
ProductABuyers AS (
SELECT DISTINCT
user_pseudo_id
FROM
Params,
PurchaseEvents,
UNNEST(items) AS items
WHERE
items.item_name = selected_product
)
SELECT
items.item_name AS item_name,
SUM(items.quantity) AS item_quantity
FROM
Params,
PurchaseEvents,
UNNEST(items) AS items
WHERE
user_pseudo_id IN (SELECT user_pseudo_id FROM ProductABuyers)
AND items.item_name != selected_product
GROUP BY 1
ORDER BY item_quantity DESC
LIMIT 1;
WITH
Params AS (
SELECT 'Google Red Speckled Tee' AS selected_product
),
DateRanges AS (
SELECT '20201101' AS start_date, '20201130' AS end_date, '202011' AS period UNION ALL
SELECT '20201201', '20201231', '202012' UNION ALL
SELECT '20210101', '20210131', '202101'
),
PurchaseEvents AS (
SELECT
period,
user_pseudo_id,
items
FROM
DateRanges
JOIN
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
ON _TABLE_SUFFIX BETWEEN start_date AND end_date
WHERE
event_name = 'purchase'
),
ProductABuyers AS (
SELECT DISTINCT
period,
user_pseudo_id
FROM
Params,
PurchaseEvents,
UNNEST(items) AS items
WHERE
items.item_name = selected_product
),
TopProducts AS (
SELECT
pe.period,
items.item_name AS item_name,
SUM(items.quantity) AS item_quantity
FROM
Params,
PurchaseEvents pe,
UNNEST(items) AS items
WHERE
user_pseudo_id IN (SELECT user_pseudo_id FROM ProductABuyers pb WHERE pb.period = pe.period)
AND items.item_name != selected_product
GROUP BY
pe.period, items.item_name
),
TopProductPerPeriod AS (
SELECT
period,
item_name,
item_quantity
FROM (
SELECT
period,
item_name,
item_quantity,
RANK() OVER (PARTITION BY period ORDER BY item_quantity DESC) AS rank
FROM
TopProducts
)
WHERE
rank = 1
)
SELECT
period,
item_name,
item_quantity
FROM
TopProductPerPeriod
ORDER BY
period;
WITH EventData AS (
SELECT
user_pseudo_id,
event_timestamp,
param
FROM
`firebase-public-project.analytics_153293282.events_20180915`,
UNNEST(event_params) AS param
WHERE
event_name = "level_complete_quickplay"
AND (param.key = "value" OR param.key = "board")
),
ProcessedData AS (
SELECT
user_pseudo_id,
event_timestamp,
MAX(IF(param.key = "value", param.value.int_value, NULL)) AS score,
MAX(IF(param.key = "board", param.value.string_value, NULL)) AS board_type
FROM
EventData
GROUP BY
user_pseudo_id,
event_timestamp
)
SELECT
ANY_VALUE(board_type) AS board,
AVG(score) AS average_score
FROM
ProcessedData
GROUP BY
board_type
WITH
UserInfo AS (
SELECT
user_pseudo_id,
COUNTIF(event_name = 'page_view') AS page_view_count,
COUNTIF(event_name IN ('in_app_purchase', 'purchase')) AS purchase_event_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _TABLE_SUFFIX BETWEEN '20201201' AND '20201231'
GROUP BY 1
),
Averages AS (
SELECT
(purchase_event_count > 0) AS purchaser,
COUNT(*) AS user_count,
SUM(page_view_count) AS total_page_views,
SUM(page_view_count) / COUNT(*) AS avg_page_views
FROM UserInfo
GROUP BY 1
)
SELECT
MAX(CASE WHEN purchaser THEN avg_page_views ELSE 0 END) -
MAX(CASE WHEN NOT purchaser THEN avg_page_views ELSE 0 END) AS avg_page_views_difference
FROM Averages;
WITH
UserInfo AS (
SELECT
user_pseudo_id,
PARSE_DATE('%Y%m%d', event_date) AS event_date,
COUNTIF(event_name = 'page_view') AS page_view_count,
COUNTIF(event_name = 'purchase') AS purchase_event_count
FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE _TABLE_SUFFIX BETWEEN '20201101' AND '20201130'
GROUP BY 1, 2
)
SELECT
event_date,
SUM(page_view_count) / COUNT(*) AS avg_page_views,
SUM(page_view_count)
FROM UserInfo
WHERE purchase_event_count > 0
GROUP BY event_date
ORDER BY event_date;
WITH prep AS (
SELECT
user_pseudo_id,
(SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id') AS session_id,
ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'source') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)] AS source,
ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'medium') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)] AS medium,
ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'campaign') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)] AS campaign
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_TABLE_SUFFIX BETWEEN '20201201' AND '20201231'
GROUP BY
user_pseudo_id,
session_id
)
SELECT
-- session default channel grouping (dimension | the channel group associated with a session)
CASE
WHEN source = '(direct)' AND (medium IN ('(not set)','(none)')) THEN 'Direct'
WHEN REGEXP_CONTAINS(campaign, 'cross-network') THEN 'Cross-network'
WHEN (REGEXP_CONTAINS(source,'alibaba|amazon|google shopping|shopify|etsy|ebay|stripe|walmart')
OR REGEXP_CONTAINS(campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$'))
AND REGEXP_CONTAINS(medium, '^(.*cp.*|ppc|paid.*)$') THEN 'Paid Shopping'
WHEN REGEXP_CONTAINS(source,'baidu|bing|duckduckgo|ecosia|google|yahoo|yandex')
AND REGEXP_CONTAINS(medium,'^(.*cp.*|ppc|paid.*)$') THEN 'Paid Search'
WHEN REGEXP_CONTAINS(source,'badoo|facebook|fb|instagram|linkedin|pinterest|tiktok|twitter|whatsapp')
AND REGEXP_CONTAINS(medium,'^(.*cp.*|ppc|paid.*)$') THEN 'Paid Social'
WHEN REGEXP_CONTAINS(source,'dailymotion|disneyplus|netflix|youtube|vimeo|twitch|vimeo|youtube')
AND REGEXP_CONTAINS(medium,'^(.*cp.*|ppc|paid.*)$') THEN 'Paid Video'
WHEN medium IN ('display', 'banner', 'expandable', 'interstitial', 'cpm') THEN 'Display'
WHEN REGEXP_CONTAINS(source,'alibaba|amazon|google shopping|shopify|etsy|ebay|stripe|walmart')
OR REGEXP_CONTAINS(campaign, '^(.*(([^a-df-z]|^)shop|shopping).*)$') THEN 'Organic Shopping'
WHEN REGEXP_CONTAINS(source,'badoo|facebook|fb|instagram|linkedin|pinterest|tiktok|twitter|whatsapp')
OR medium IN ('social','social-network','social-media','sm','social network','social media') THEN 'Organic Social'
WHEN REGEXP_CONTAINS(source,'dailymotion|disneyplus|netflix|youtube|vimeo|twitch|vimeo|youtube')
OR REGEXP_CONTAINS(medium,'^(.*video.*)$') THEN 'Organic Video'
WHEN REGEXP_CONTAINS(source,'baidu|bing|duckduckgo|ecosia|google|yahoo|yandex')
OR medium = 'organic' THEN 'Organic Search'
WHEN REGEXP_CONTAINS(source,'email|e-mail|e_mail|e mail')
OR REGEXP_CONTAINS(medium,'email|e-mail|e_mail|e mail') THEN 'Email'
WHEN medium = 'affiliate' THEN 'Affiliates'
WHEN medium = 'referral' THEN 'Referral'
WHEN medium = 'audio' THEN 'Audio'
WHEN medium = 'sms' THEN 'SMS'
WHEN medium LIKE '%push'
OR REGEXP_CONTAINS(medium,'mobile|notification') THEN 'Mobile Push Notifications'
ELSE 'Unassigned'
END AS channel_grouping_session
FROM
prep
GROUP BY
channel_grouping_session
ORDER BY
COUNT(DISTINCT CONCAT(user_pseudo_id, session_id)) DESC
LIMIT 1 OFFSET 3
WITH top_category AS (
SELECT
product.item_category,
SUM(ecommerce.tax_value_in_usd) / SUM(ecommerce.purchase_revenue_in_usd) AS tax_rate
FROM
bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20201130,
UNNEST(items) AS product
WHERE
event_name = 'purchase'
GROUP BY
product.item_category
ORDER BY
tax_rate DESC
LIMIT 1
)
SELECT
ecommerce.transaction_id,
SUM(ecommerce.total_item_quantity) AS total_item_quantity,
SUM(ecommerce.purchase_revenue_in_usd) AS purchase_revenue_in_usd,
SUM(ecommerce.purchase_revenue) AS purchase_revenue
FROM
bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_20201130,
UNNEST(items) AS product
JOIN top_category
ON product.item_category = top_category.item_category
WHERE
event_name = 'purchase'
GROUP BY
ecommerce.transaction_id;
WITH unnested_events AS (
SELECT
MAX(CASE WHEN event_params.key = 'page_location' THEN event_params.value.string_value END) AS page_location,
user_pseudo_id,
event_timestamp
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`,
UNNEST(event_params) AS event_params
WHERE
_TABLE_SUFFIX BETWEEN '20210101' AND '20210131'
AND event_name = 'page_view'
GROUP BY user_pseudo_id,event_timestamp
),
temp AS (
SELECT
page_location,
COUNT(*) AS event_count,
COUNT(DISTINCT user_pseudo_id) AS users
FROM
unnested_events
GROUP BY page_location
ORDER BY event_count DESC
)
SELECT users
FROM
temp
LIMIT 1
WITH base_table AS (
SELECT
event_name,
event_date,
event_timestamp,
user_pseudo_id,
user_id,
device,
geo,
traffic_source,
event_params,
user_properties
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_table_suffix = '20210102'
AND event_name IN ('page_view')
)
, unnested_events AS (
-- unnests event parameters to get to relevant keys and values
SELECT
event_date AS date,
event_timestamp AS event_timestamp_microseconds,
user_pseudo_id,
MAX(CASE WHEN c.key = 'ga_session_id' THEN c.value.int_value END) AS visitID,
MAX(CASE WHEN c.key = 'ga_session_number' THEN c.value.int_value END) AS visitNumber,
MAX(CASE WHEN c.key = 'page_title' THEN c.value.string_value END) AS page_title,
MAX(CASE WHEN c.key = 'page_location' THEN c.value.string_value END) AS page_location
FROM
base_table,
UNNEST (event_params) c
GROUP BY 1,2,3
)
, unnested_events_categorised AS (
-- categorizing Page Titles into PDPs and PLPs
SELECT
*,
CASE WHEN ARRAY_LENGTH(SPLIT(page_location, '/')) >= 5
AND
CONTAINS_SUBSTR(ARRAY_REVERSE(SPLIT(page_location, '/'))[SAFE_OFFSET(0)], '+')
AND (LOWER(SPLIT(page_location, '/')[SAFE_OFFSET(4)]) IN
('accessories','apparel','brands','campus+collection','drinkware',
'electronics','google+redesign',
'lifestyle','nest','new+2015+logo','notebooks+journals',
'office','shop+by+brand','small+goods','stationery','wearables'
)
OR
LOWER(SPLIT(page_location, '/')[SAFE_OFFSET(3)]) IN
('accessories','apparel','brands','campus+collection','drinkware',
'electronics','google+redesign',
'lifestyle','nest','new+2015+logo','notebooks+journals',
'office','shop+by+brand','small+goods','stationery','wearables'
)
)
THEN 'PDP'
WHEN NOT(CONTAINS_SUBSTR(ARRAY_REVERSE(SPLIT(page_location, '/'))[SAFE_OFFSET(0)], '+'))
AND (LOWER(SPLIT(page_location, '/')[SAFE_OFFSET(4)]) IN
('accessories','apparel','brands','campus+collection','drinkware',
'electronics','google+redesign',
'lifestyle','nest','new+2015+logo','notebooks+journals',
'office','shop+by+brand','small+goods','stationery','wearables'
)
OR
LOWER(SPLIT(page_location, '/')[SAFE_OFFSET(3)]) IN
('accessories','apparel','brands','campus+collection','drinkware',
'electronics','google+redesign',
'lifestyle','nest','new+2015+logo','notebooks+journals',
'office','shop+by+brand','small+goods','stationery','wearables'
)
)
THEN 'PLP'
ELSE page_title
END AS page_title_adjusted
FROM
unnested_events
)
, ranked_screens AS (
SELECT
*,
LAG(page_title_adjusted,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) previous_page,
LEAD(page_title_adjusted,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) next_page
FROM
unnested_events_categorised
)
,PLPtoPDPTransitions AS (
SELECT
user_pseudo_id,
visitID
FROM
ranked_screens
WHERE
page_title_adjusted = 'PLP' AND next_page = 'PDP'
)
,TotalPLPViews AS (
SELECT
COUNT(*) AS total_plp_views
FROM
ranked_screens
WHERE
page_title_adjusted = 'PLP'
)
,TotalTransitions AS (
SELECT
COUNT(*) AS total_transitions
FROM
PLPtoPDPTransitions
)
SELECT
(total_transitions * 100.0) / total_plp_views AS percentage
FROM
TotalTransitions, TotalPLPViews;
WITH
--List of users who installed
sept_cohort AS (
SELECT DISTINCT user_pseudo_id,
FORMAT_DATE('%Y-%m-%d', PARSE_DATE('%Y%m%d', event_date)) AS date_first_open,
FROM `firebase-public-project.analytics_153293282.events_*`
WHERE event_name = 'first_open'
AND _TABLE_SUFFIX BETWEEN '20180801' and '20180930'
),
--Get the list of users who uninstalled
uninstallers AS (
SELECT DISTINCT user_pseudo_id,
FORMAT_DATE('%Y-%m-%d', PARSE_DATE('%Y%m%d', event_date)) AS date_app_remove,
FROM `firebase-public-project.analytics_153293282.events_*`
WHERE event_name = 'app_remove'
AND _TABLE_SUFFIX BETWEEN '20180801' and '20180930'
),
--Join the 2 tables and compute for # of days to uninstall
joined AS (
SELECT a.*,
b.date_app_remove,
DATE_DIFF(DATE(b.date_app_remove), DATE(a.date_first_open), DAY) AS days_to_uninstall
FROM sept_cohort a
LEFT JOIN uninstallers b
ON a.user_pseudo_id = b.user_pseudo_id
)
--Compute for the percentage
SELECT
COUNT(DISTINCT
CASE WHEN days_to_uninstall > 7 OR days_to_uninstall IS NULL THEN user_pseudo_id END) /
COUNT(DISTINCT user_pseudo_id)
AS percent_users_7_days
FROM joined
-- Define the date range and calculate the minimum date for filtering results
WITH dates AS (
SELECT
DATE('2018-08-01') AS start_date,
DATE('2018-08-15') AS end_date
),
-- Create a table of active dates for each user within the specified date range
dates_active_table AS (
SELECT
user_pseudo_id,
PARSE_DATE('%Y%m%d', `event_date`) AS user_active_date
FROM
`firebase-public-project.analytics_153293282.events_*`
WHERE
event_name = 'session_start'
AND PARSE_DATE('%Y%m%d', `event_date`) BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
GROUP BY
user_pseudo_id, user_active_date
),
-- Create a table of the earliest quickplay event date for each user within the specified date range
event_table AS (
SELECT
user_pseudo_id,
event_name,
MIN(PARSE_DATE('%Y%m%d', `event_date`)) AS event_cohort_date
FROM
`firebase-public-project.analytics_153293282.events_*`
WHERE
event_name IN ('level_start_quickplay', 'level_end_quickplay', 'level_complete_quickplay',
'level_fail_quickplay', 'level_reset_quickplay', 'level_retry_quickplay')
AND PARSE_DATE('%Y%m%d', `event_date`) BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
GROUP BY
user_pseudo_id, event_name
),
-- Calculate the number of days since each user's initial quickplay event
days_since_event_table AS (
SELECT
events.user_pseudo_id,
events.event_name AS event_cohort,
events.event_cohort_date,
days.user_active_date,
DATE_DIFF(days.user_active_date, events.event_cohort_date, DAY) AS days_since_event
FROM
event_table events
LEFT JOIN
dates_active_table days ON events.user_pseudo_id = days.user_pseudo_id
WHERE
events.event_cohort_date <= days.user_active_date
),
-- Calculate the weeks since each user's initial quickplay event and count the active days in each week
weeks_retention AS (
SELECT
event_cohort,
user_pseudo_id,
CAST(CASE WHEN days_since_event = 0 THEN 0 ELSE CEIL(days_since_event / 7) END AS INTEGER) AS weeks_since_event,
COUNT(DISTINCT days_since_event) AS days_active_since_event -- Count Days Active in Week
FROM
days_since_event_table
GROUP BY
event_cohort, user_pseudo_id, weeks_since_event
),
-- Aggregate the weekly retention data
aggregated_weekly_retention_table AS (
SELECT
event_cohort,
weeks_since_event,
SUM(days_active_since_event) AS weekly_days_active,
COUNT(DISTINCT user_pseudo_id) AS retained_users
FROM
weeks_retention
GROUP BY
event_cohort, weeks_since_event
),
RETENTION_INFO AS (
-- Select and calculate the weekly retention rate for each event cohort
SELECT
event_cohort,
weeks_since_event,
weekly_days_active,
retained_users,
(retained_users / MAX(retained_users) OVER (PARTITION BY event_cohort)) AS retention_rate
FROM
aggregated_weekly_retention_table
ORDER BY
event_cohort, weeks_since_event
)
SELECT event_cohort
FROM
RETENTION_INFO
WHERE weeks_since_event = 2
ORDER BY retention_rate
LIMIT 1
-- Define the date range and calculate the minimum date for filtering results
WITH dates AS (
SELECT
DATE('2018-07-02') AS start_date,
DATE('2018-07-16') AS end_date
),
-- Create a table of active dates for each user within the specified date range
dates_active_table AS (
SELECT
user_pseudo_id,
PARSE_DATE('%Y%m%d', `event_date`) AS user_active_date
FROM
`firebase-public-project.analytics_153293282.events_*`
WHERE
event_name = 'session_start'
AND PARSE_DATE('%Y%m%d', `event_date`) BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
GROUP BY
user_pseudo_id, user_active_date
),
-- Create a table of the earliest quickplay event date for each user within the specified date range
event_table AS (
SELECT
user_pseudo_id,
event_name,
MIN(PARSE_DATE('%Y%m%d', `event_date`)) AS event_cohort_date
FROM
`firebase-public-project.analytics_153293282.events_*`
WHERE
event_name IN ('level_start_quickplay', 'level_end_quickplay', 'level_complete_quickplay',
'level_fail_quickplay', 'level_reset_quickplay', 'level_retry_quickplay')
AND PARSE_DATE('%Y%m%d', `event_date`) BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
GROUP BY
user_pseudo_id, event_name
),
-- Calculate the number of days since each user's initial quickplay event
days_since_event_table AS (
SELECT
events.user_pseudo_id,
events.event_name AS event_cohort,
events.event_cohort_date,
days.user_active_date,
DATE_DIFF(days.user_active_date, events.event_cohort_date, DAY) AS days_since_event
FROM
event_table events
LEFT JOIN
dates_active_table days ON events.user_pseudo_id = days.user_pseudo_id
WHERE
events.event_cohort_date <= days.user_active_date
),
-- Calculate the weeks since each user's initial quickplay event and count the active days in each week
weeks_retention AS (
SELECT
event_cohort,
user_pseudo_id,
CAST(CASE WHEN days_since_event = 0 THEN 0 ELSE CEIL(days_since_event / 7) END AS INTEGER) AS weeks_since_event,
COUNT(DISTINCT days_since_event) AS days_active_since_event -- Count Days Active in Week
FROM
days_since_event_table
GROUP BY
event_cohort, user_pseudo_id, weeks_since_event
),
-- Aggregate the weekly retention data
aggregated_weekly_retention_table AS (
SELECT
event_cohort,
weeks_since_event,
SUM(days_active_since_event) AS weekly_days_active,
COUNT(DISTINCT user_pseudo_id) AS retained_users
FROM
weeks_retention
GROUP BY
event_cohort, weeks_since_event
),
RETENTION_INFO AS (
SELECT
event_cohort,
weeks_since_event,
weekly_days_active,
retained_users,
(retained_users / MAX(retained_users) OVER (PARTITION BY event_cohort)) AS retention_rate
FROM
aggregated_weekly_retention_table
ORDER BY
event_cohort, weeks_since_event
)
SELECT event_cohort, retention_rate
FROM
RETENTION_INFO
WHERE weeks_since_event = 2
WITH analytics_data AS (
SELECT user_pseudo_id, event_timestamp, event_name,
UNIX_MICROS(TIMESTAMP("2018-09-01 00:00:00", "+8:00")) AS start_day,
3600*1000*1000*24*7 AS one_week_micros
FROM `firebase-public-project.analytics_153293282.events_*`
WHERE _table_suffix BETWEEN '20180901' AND '20180930'
)
SELECT
week_1_cohort / week_0_cohort AS week_1_pct,
week_2_cohort / week_0_cohort AS week_2_pct,
week_3_cohort / week_0_cohort AS week_3_pct
FROM (
WITH week_3_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(3*one_week_micros) AND start_day+(4*one_week_micros)
),
week_2_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(2*one_week_micros) AND start_day+(3*one_week_micros)
),
week_1_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_timestamp BETWEEN start_day+(1*one_week_micros) AND start_day+(2*one_week_micros)
),
week_0_users AS (
SELECT DISTINCT user_pseudo_id
FROM analytics_data
WHERE event_name = 'first_open'
AND event_timestamp BETWEEN start_day AND start_day+(1*one_week_micros)
)
SELECT
(SELECT count(*)
FROM week_0_users) AS week_0_cohort,
(SELECT count(*)
FROM week_1_users
JOIN week_0_users USING (user_pseudo_id)) AS week_1_cohort,
(SELECT count(*)
FROM week_2_users
JOIN week_0_users USING (user_pseudo_id)) AS week_2_cohort,
(SELECT count(*)
FROM week_3_users
JOIN week_0_users USING (user_pseudo_id)) AS week_3_cohort
)
WITH dates AS (
SELECT
DATE('2018-07-02') AS start_date,
DATE('2018-10-02') AS end_date,
DATE_ADD(DATE_TRUNC(DATE('2018-10-02'), WEEK(TUESDAY)), INTERVAL -4 WEEK) AS min_date
),
date_table AS (
SELECT DISTINCT
PARSE_DATE('%Y%m%d', `event_date`) AS event_date,
user_pseudo_id,
CASE
WHEN DATE_DIFF(PARSE_DATE('%Y%m%d', `event_date`), DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)), DAY) = 0
THEN 1
ELSE 0
END AS is_new_user
FROM
`firebase-public-project.analytics_153293282.events_*`
WHERE
event_name = 'session_start'
),
new_user_list AS (
SELECT DISTINCT
user_pseudo_id,
event_date
FROM
date_table
WHERE
is_new_user = 1
),
days_since_start_table AS (
SELECT DISTINCT
is_new_user,
nu.event_date AS date_cohort,
dt.user_pseudo_id,
dt.event_date,
DATE_DIFF(dt.event_date, nu.event_date, DAY) AS days_since_start
FROM
date_table dt
JOIN
new_user_list nu ON dt.user_pseudo_id = nu.user_pseudo_id
),
weeks_retention AS (
SELECT
date_cohort,
DATE_TRUNC(date_cohort, WEEK(MONDAY)) AS week_cohort,
user_pseudo_id,
days_since_start,
CASE
WHEN days_since_start = 0
THEN 0
ELSE CEIL(days_since_start / 7)
END AS weeks_since_start
FROM
days_since_start_table
),
RETENTION_INFO AS (
SELECT
week_cohort,
weeks_since_start,
COUNT(DISTINCT user_pseudo_id) AS retained_users
FROM
weeks_retention
WHERE
week_cohort <= (SELECT min_date FROM dates)
GROUP BY
week_cohort,
weeks_since_start
HAVING
weeks_since_start <= 4
ORDER BY
week_cohort,
weeks_since_start
)
SELECT weeks_since_start, retained_users
FROM RETENTION_INFO
WHERE week_cohort = DATE('2018-07-02')
WITH RecencyScore AS (
SELECT customer_unique_id,
MAX(order_purchase_timestamp) AS last_purchase,
NTILE(5) OVER (ORDER BY MAX(order_purchase_timestamp) DESC) AS recency
FROM orders
JOIN customers USING (customer_id)
WHERE order_status = 'delivered'
GROUP BY customer_unique_id
),
FrequencyScore AS (
SELECT customer_unique_id,
COUNT(order_id) AS total_orders,
NTILE(5) OVER (ORDER BY COUNT(order_id) DESC) AS frequency
FROM orders
JOIN customers USING (customer_id)
WHERE order_status = 'delivered'
GROUP BY customer_unique_id
),
MonetaryScore AS (
SELECT customer_unique_id,
SUM(price) AS total_spent,
NTILE(5) OVER (ORDER BY SUM(price) DESC) AS monetary
FROM orders
JOIN order_items USING (order_id)
JOIN customers USING (customer_id)
WHERE order_status = 'delivered'
GROUP BY customer_unique_id
),
-- 2. Assign each customer to a group
RFM AS (
SELECT last_purchase, total_orders, total_spent,
CASE
WHEN recency = 1 AND frequency + monetary IN (1, 2, 3, 4) THEN "Champions"
WHEN recency IN (4, 5) AND frequency + monetary IN (1, 2) THEN "Can't Lose Them"
WHEN recency IN (4, 5) AND frequency + monetary IN (3, 4, 5, 6) THEN "Hibernating"
WHEN recency IN (4, 5) AND frequency + monetary IN (7, 8, 9, 10) THEN "Lost"
WHEN recency IN (2, 3) AND frequency + monetary IN (1, 2, 3, 4) THEN "Loyal Customers"
WHEN recency = 3 AND frequency + monetary IN (5, 6) THEN "Needs Attention"
WHEN recency = 1 AND frequency + monetary IN (7, 8) THEN "Recent Users"
WHEN recency = 1 AND frequency + monetary IN (5, 6) OR
recency = 2 AND frequency + monetary IN (5, 6, 7, 8) THEN "Potentital Loyalists"
WHEN recency = 1 AND frequency + monetary IN (9, 10) THEN "Price Sensitive"
WHEN recency = 2 AND frequency + monetary IN (9, 10) THEN "Promising"
WHEN recency = 3 AND frequency + monetary IN (7, 8, 9, 10) THEN "About to Sleep"
END AS RFM_Bucket
FROM RecencyScore
JOIN FrequencyScore USING (customer_unique_id)
JOIN MonetaryScore USING (customer_unique_id)
)
SELECT RFM_Bucket,
AVG(total_spent / total_orders) AS avg_sales_per_customer
FROM RFM
GROUP BY RFM_Bucket
WITH CustomerData AS (
SELECT
customer_unique_id,
COUNT(DISTINCT orders.order_id) AS order_count,
SUM(payment_value) AS total_payment,
JULIANDAY(MIN(order_purchase_timestamp)) AS first_order_day,
JULIANDAY(MAX(order_purchase_timestamp)) AS last_order_day
FROM customers
JOIN orders USING (customer_id)
JOIN order_payments USING (order_id)
GROUP BY customer_unique_id
)
SELECT
customer_unique_id,
order_count AS PF,
ROUND(total_payment / order_count, 2) AS AOV,
CASE
WHEN (last_order_day - first_order_day) < 7 THEN
1
ELSE
(last_order_day - first_order_day) / 7
END AS ACL
FROM CustomerData
ORDER BY AOV DESC
LIMIT 3
WITH player_stats AS (
SELECT
b.player_id,
p.name_given AS player_name,
SUM(b.g) AS games_played,
SUM(b.r) AS runs,
SUM(b.h) AS hits,
SUM(b.hr) AS home_runs
FROM player p
JOIN batting b ON p.player_id = b.player_id
GROUP BY b.player_id, p.name_given
)
SELECT 'Games Played' AS Category, player_name AS Player_Name, games_played AS Batting_Table_Topper
FROM player_stats
WHERE games_played = (SELECT MAX(games_played) FROM player_stats)
UNION ALL
SELECT 'Runs' AS Category, player_name AS Player_Name, runs AS Batting_Table_Topper
FROM player_stats
WHERE runs = (SELECT MAX(runs) FROM player_stats)
UNION ALL
SELECT 'Hits' AS Category, player_name AS Player_Name, hits AS Batting_Table_Topper
FROM player_stats
WHERE hits = (SELECT MAX(hits) FROM player_stats)
UNION ALL
SELECT 'Home Runs' AS Category, player_name AS Player_Name, home_runs AS Batting_Table_Topper
FROM player_stats
WHERE home_runs = (SELECT MAX(home_runs) FROM player_stats);
WITH AnnualTotals AS (
SELECT
STRFTIME('%Y', collision_date) AS Year,
COUNT(case_id) AS AnnualTotal
FROM
collisions
GROUP BY
Year
),
CategoryTotals AS (
SELECT
STRFTIME('%Y', collision_date) AS Year,
pcf_violation_category AS Category,
COUNT(case_id) AS Subtotal
FROM
collisions
GROUP BY
Year, Category
),
CategoryPercentages AS (
SELECT
ct.Year,
ct.Category,
ROUND((ct.Subtotal * 100.0) / at.AnnualTotal, 1) AS PercentageOfAnnualRoadIncidents
FROM
CategoryTotals ct
JOIN
AnnualTotals at ON ct.Year = at.Year
),
RankedCategories AS (
SELECT
Year,
Category,
PercentageOfAnnualRoadIncidents,
ROW_NUMBER() OVER (PARTITION BY Year ORDER BY PercentageOfAnnualRoadIncidents DESC) AS Rank
FROM
CategoryPercentages
),
TopTwoCategories AS (
SELECT
Year,
GROUP_CONCAT(Category, ', ') AS TopCategories
FROM
RankedCategories
WHERE
Rank <= 2
GROUP BY
Year
),
UniqueYear AS (
SELECT
Year
FROM
TopTwoCategories
GROUP BY
TopCategories
HAVING COUNT(Year) = 1
),
results AS (
SELECT
rc.Year,
rc.Category,
rc.PercentageOfAnnualRoadIncidents
FROM
UniqueYear u
JOIN
RankedCategories rc ON u.Year = rc.Year
WHERE
rc.Rank <= 2
)
SELECT distinct Year FROM results
WITH MatchDetails AS (
SELECT
b.name AS titles,
m.duration AS match_duration,
w1.name || ' vs ' || w2.name AS matches,
m.win_type AS win_type,
l.name AS location,
e.name AS event,
ROW_NUMBER() OVER (PARTITION BY b.name ORDER BY m.duration ASC) AS rank
FROM
Belts b
INNER JOIN Matches m ON m.title_id = b.id
INNER JOIN Wrestlers w1 ON w1.id = m.winner_id
INNER JOIN Wrestlers w2 ON w2.id = m.loser_id
INNER JOIN Cards c ON c.id = m.card_id
INNER JOIN Locations l ON l.id = c.location_id
INNER JOIN Events e ON e.id = c.event_id
INNER JOIN Promotions p ON p.id = c.promotion_id
WHERE
p.name = 'NXT'
AND m.duration <> ''
AND b.name <> ''
AND b.name NOT IN (
SELECT name
FROM Belts
WHERE name LIKE '%title change%'
)
),
Rank1 AS (
SELECT
titles,
match_duration,
matches,
win_type,
location,
event
FROM
MatchDetails
WHERE
rank = 1
)
SELECT
SUBSTR(matches, 1, INSTR(matches, ' vs ') - 1) AS wrestler1,
SUBSTR(matches, INSTR(matches, ' vs ') + 4) AS wrestler2
FROM
Rank1
ORDER BY match_duration
LIMIT 1
-- Step 1: Calculate players' total runs in each match
WITH player_runs AS (
SELECT
bbb.striker AS player_id,
bbb.match_id,
SUM(bsc.runs_scored) AS total_runs
FROM
ball_by_ball AS bbb
JOIN
batsman_scored AS bsc
ON
bbb.match_id = bsc.match_id
AND bbb.over_id = bsc.over_id
AND bbb.ball_id = bsc.ball_id
AND bbb.innings_no = bsc.innings_no
GROUP BY
bbb.striker, bbb.match_id
HAVING
SUM(bsc.runs_scored) >= 100
),
-- Step 2: Identify losing teams for each match
losing_teams AS (
SELECT
match_id,
CASE
WHEN match_winner = team_1 THEN team_2
ELSE team_1
END AS loser
FROM
match
),
-- Step 3: Combine the above results to get players who scored 100 or more runs in losing teams
players_in_losing_teams AS (
SELECT
pr.player_id,
pr.match_id
FROM
player_runs AS pr
JOIN
losing_teams AS lt
ON
pr.match_id = lt.match_id
JOIN
player_match AS pm
ON
pr.player_id = pm.player_id
AND pr.match_id = pm.match_id
AND lt.loser = pm.team_id
)
-- Step 4: Select distinct player names from the player table
SELECT DISTINCT
p.player_name
FROM
player AS p
JOIN
players_in_losing_teams AS plt
ON
p.player_id = plt.player_id
ORDER BY
p.player_name;
WITH runs_scored AS (
SELECT
bb.striker AS player_id,
bb.match_id,
bs.runs_scored AS runs
FROM
ball_by_ball AS bb
JOIN
batsman_scored AS bs ON bb.match_id = bs.match_id
AND bb.over_id = bs.over_id
AND bb.ball_id = bs.ball_id
AND bb.innings_no = bs.innings_no
WHERE
bb.match_id IN (SELECT match_id FROM match WHERE season_id = 5)
),
total_runs AS (
SELECT
player_id,
match_id,
SUM(runs) AS total_runs
FROM
runs_scored
GROUP BY
player_id, match_id
),
batting_averages AS (
SELECT
player_id,
SUM(total_runs) AS runs,
COUNT(match_id) AS num_matches,
ROUND(SUM(total_runs) / CAST(COUNT(match_id) AS FLOAT), 3) AS batting_avg
FROM
total_runs
GROUP BY
player_id
ORDER BY
batting_avg DESC
LIMIT 5
)
SELECT
p.player_name,
b.batting_avg
FROM
player AS p
JOIN
batting_averages AS b ON p.player_id = b.player_id
ORDER BY
b.batting_avg DESC;
WITH customer_orders AS (
SELECT
c.customer_unique_id,
COUNT(o.order_id) AS Total_Orders_By_Customers,
AVG(p.payment_value) AS Average_Payment_By_Customer,
c.customer_city,
c.customer_state
FROM olist_customers c
JOIN olist_orders o ON c.customer_id = o.customer_id
JOIN olist_order_payments p ON o.order_id = p.order_id
WHERE o.order_status = 'delivered'
GROUP BY c.customer_unique_id, c.customer_city, c.customer_state
)
SELECT
Average_Payment_By_Customer,
customer_city,
customer_state
FROM customer_orders
ORDER BY Total_Orders_By_Customers DESC
LIMIT 3;
SELECT
actor.first_name || ' ' || actor.last_name AS full_name
FROM
actor
INNER JOIN film_actor ON actor.actor_id = film_actor.actor_id
INNER JOIN film ON film_actor.film_id = film.film_id
INNER JOIN film_category ON film.film_id = film_category.film_id
INNER JOIN category ON film_category.category_id = category.category_id
-- Join with the language table
INNER JOIN language ON film.language_id = language.language_id
WHERE
category.name = 'Children' AND
film.release_year BETWEEN 2000 AND 2010 AND
film.rating IN ('G', 'PG') AND
language.name = 'English' AND
film.length <= 120
GROUP BY
actor.actor_id, actor.first_name, actor.last_name
ORDER BY
COUNT(film.film_id) DESC
LIMIT 1;
SELECT
category.name
FROM
category
INNER JOIN film_category USING (category_id)
INNER JOIN film USING (film_id)
INNER JOIN inventory USING (film_id)
INNER JOIN rental USING (inventory_id)
INNER JOIN customer USING (customer_id)
INNER JOIN address USING (address_id)
INNER JOIN city USING (city_id)
WHERE
LOWER(city.city) LIKE 'a%' OR city.city LIKE '%-%'
GROUP BY
category.name
ORDER BY
SUM(CAST((julianday(rental.return_date) - julianday(rental.rental_date)) * 24 AS INTEGER)) DESC
LIMIT
1;
WITH UniqueProducts2020 AS (
SELECT
dp.segment,
COUNT(DISTINCT fsm.product_code) AS unique_products_2020
FROM
hardware_fact_sales_monthly fsm
JOIN
hardware_dim_product dp ON fsm.product_code = dp.product_code
WHERE
fsm.fiscal_year = 2020
GROUP BY
dp.segment
),
UniqueProducts2021 AS (
SELECT
dp.segment,
COUNT(DISTINCT fsm.product_code) AS unique_products_2021
FROM
hardware_fact_sales_monthly fsm
JOIN
hardware_dim_product dp ON fsm.product_code = dp.product_code
WHERE
fsm.fiscal_year = 2021
GROUP BY
dp.segment
)
SELECT
spc.segment,
spc.unique_products_2020 AS product_count_2020
FROM
UniqueProducts2020 spc
JOIN
UniqueProducts2021 fup ON spc.segment = fup.segment
ORDER BY
((fup.unique_products_2021 - spc.unique_products_2020) * 100.0) / (spc.unique_products_2020) DESC;
WITH get_extras_count AS (
WITH RECURSIVE split_extras AS (
SELECT
order_id,
TRIM(SUBSTR(extras, 1, INSTR(extras || ',', ',') - 1)) AS each_extra,
SUBSTR(extras || ',', INSTR(extras || ',', ',') + 1) AS remaining_extras
FROM
pizza_clean_customer_orders
UNION ALL
SELECT
order_id,
TRIM(SUBSTR(remaining_extras, 1, INSTR(remaining_extras, ',') - 1)) AS each_extra,
SUBSTR(remaining_extras, INSTR(remaining_extras, ',') + 1)
FROM
split_extras
WHERE
remaining_extras <> ''
)
SELECT
order_id,
COUNT(each_extra) AS total_extras
FROM
split_extras
GROUP BY
order_id
),
calculate_totals AS (
SELECT
t1.order_id,
t1.pizza_id,
SUM(
CASE
WHEN pizza_id = 1 THEN 12
WHEN pizza_id = 2 THEN 10
END
) AS total_price,
t3.total_extras
FROM
pizza_clean_customer_orders AS t1
JOIN
pizza_clean_runner_orders AS t2
ON
t2.order_id = t1.order_id
LEFT JOIN
get_extras_count AS t3
ON
t3.order_id = t1.order_id
WHERE
t2.cancellation IS NULL
GROUP BY
t1.order_id,
t1.pizza_id,
t3.total_extras
)
SELECT
SUM(total_price) + SUM(total_extras) AS total_income
FROM
calculate_totals;
WITH cte_cleaned_customer_orders AS (
SELECT
*,
ROW_NUMBER() OVER () AS original_row_number
FROM
pizza_clean_customer_orders
),
split_regular_toppings AS (
SELECT
pizza_id,
TRIM(SUBSTR(toppings, 1, INSTR(toppings || ',', ',') - 1)) AS topping_id,
SUBSTR(toppings || ',', INSTR(toppings || ',', ',') + 1) AS remaining_toppings
FROM
pizza_recipes
UNION ALL
SELECT
pizza_id,
TRIM(SUBSTR(remaining_toppings, 1, INSTR(remaining_toppings, ',') - 1)) AS topping_id,
SUBSTR(remaining_toppings, INSTR(remaining_toppings, ',') + 1) AS remaining_toppings
FROM
split_regular_toppings
WHERE
remaining_toppings <> ''
),
cte_base_toppings AS (
SELECT
t1.order_id,
t1.customer_id,
t1.pizza_id,
t1.order_time,
t1.original_row_number,
t2.topping_id
FROM
cte_cleaned_customer_orders AS t1
LEFT JOIN
split_regular_toppings AS t2
ON
t1.pizza_id = t2.pizza_id
),
split_exclusions AS (
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
TRIM(SUBSTR(exclusions, 1, INSTR(exclusions || ',', ',') - 1)) AS topping_id,
SUBSTR(exclusions || ',', INSTR(exclusions || ',', ',') + 1) AS remaining_exclusions
FROM
cte_cleaned_customer_orders
WHERE
exclusions IS NOT NULL
UNION ALL
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
TRIM(SUBSTR(remaining_exclusions, 1, INSTR(remaining_exclusions, ',') - 1)) AS topping_id,
SUBSTR(remaining_exclusions, INSTR(remaining_exclusions, ',') + 1) AS remaining_exclusions
FROM
split_exclusions
WHERE
remaining_exclusions <> ''
),
split_extras AS (
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
TRIM(SUBSTR(extras, 1, INSTR(extras || ',', ',') - 1)) AS topping_id,
SUBSTR(extras || ',', INSTR(extras || ',', ',') + 1) AS remaining_extras
FROM
cte_cleaned_customer_orders
WHERE
extras IS NOT NULL
UNION ALL
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
TRIM(SUBSTR(remaining_extras, 1, INSTR(remaining_extras, ',') - 1)) AS topping_id,
SUBSTR(remaining_extras, INSTR(remaining_extras, ',') + 1) AS remaining_extras
FROM
split_extras
WHERE
remaining_extras <> ''
),
cte_combined_orders AS (
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
topping_id
FROM
cte_base_toppings
WHERE topping_id NOT IN (SELECT topping_id FROM split_exclusions WHERE split_exclusions.order_id = cte_base_toppings.order_id)
UNION ALL
SELECT
order_id,
customer_id,
pizza_id,
order_time,
original_row_number,
topping_id
FROM
split_extras
)
SELECT
t2.topping_name,
COUNT(*) AS topping_count
FROM
cte_combined_orders AS t1
JOIN
pizza_toppings AS t2
ON
t1.topping_id = t2.topping_id
GROUP BY
t2.topping_name
ORDER BY
topping_count DESC;
WITH product_viewed AS (
SELECT
t1.page_id,
SUM(CASE WHEN event_type = 1 THEN 1 ELSE 0 END) AS n_page_views,
SUM(CASE WHEN event_type = 2 THEN 1 ELSE 0 END) AS n_added_to_cart
FROM
shopping_cart_page_hierarchy AS t1
JOIN
shopping_cart_events AS t2
ON
t1.page_id = t2.page_id
WHERE
t1.product_id IS NOT NULL
GROUP BY
t1.page_id
),
product_purchased AS (
SELECT
t2.page_id,
SUM(CASE WHEN event_type = 2 THEN 1 ELSE 0 END) AS purchased_from_cart
FROM
shopping_cart_page_hierarchy AS t1
JOIN
shopping_cart_events AS t2
ON
t1.page_id = t2.page_id
WHERE
t1.product_id IS NOT NULL
AND EXISTS (
SELECT
visit_id
FROM
shopping_cart_events
WHERE
event_type = 3
AND t2.visit_id = visit_id
)
AND t1.page_id NOT IN (1, 2, 12, 13)
GROUP BY
t2.page_id
),
product_abandoned AS (
SELECT
t2.page_id,
SUM(CASE WHEN event_type = 2 THEN 1 ELSE 0 END) AS abandoned_in_cart
FROM
shopping_cart_page_hierarchy AS t1
JOIN
shopping_cart_events AS t2
ON
t1.page_id = t2.page_id
WHERE
t1.product_id IS NOT NULL
AND NOT EXISTS (
SELECT
visit_id
FROM
shopping_cart_events
WHERE
event_type = 3
AND t2.visit_id = visit_id
)
AND t1.page_id NOT IN (1, 2, 12, 13)
GROUP BY
t2.page_id
)
SELECT
t1.page_id,
t1.page_name,
t2.n_page_views AS 'number of product being viewed',
t2.n_added_to_cart AS 'number added to the cart',
t4.abandoned_in_cart AS 'without being purchased in cart',
t3.purchased_from_cart AS 'count of actual purchases'
FROM
shopping_cart_page_hierarchy AS t1
JOIN
product_viewed AS t2
ON
t2.page_id = t1.page_id
JOIN
product_purchased AS t3
ON
t3.page_id = t1.page_id
JOIN
product_abandoned AS t4
ON
t4.page_id = t1.page_id;
WITH get_interest_rank AS (
SELECT
t1.month_year,
t2.interest_name,
t1.composition,
RANK() OVER (
PARTITION BY t2.interest_name
ORDER BY t1.composition DESC
) AS interest_rank
FROM
interest_metrics AS t1
JOIN
interest_map AS t2
ON
t1.interest_id = t2.id
WHERE
t1.month_year IS NOT NULL
),
get_top_10 AS (
SELECT
month_year,
interest_name,
composition
FROM
get_interest_rank
WHERE
interest_rank = 1
ORDER BY
composition DESC
LIMIT 10
),
get_bottom_10 AS (
SELECT
month_year,
interest_name,
composition
FROM
get_interest_rank
WHERE
interest_rank = 1
ORDER BY
composition ASC
LIMIT 10
)
SELECT *
FROM
get_top_10
UNION
SELECT *
FROM
get_bottom_10
ORDER BY
composition DESC;
WITH YASH_CHOPRAS_PID AS (
SELECT
TRIM(P.PID) AS PID
FROM
Person P
WHERE
TRIM(P.Name) = 'Yash Chopra'
),
NUM_OF_MOV_BY_ACTOR_DIRECTOR AS (
SELECT
TRIM(MC.PID) AS ACTOR_PID,
TRIM(MD.PID) AS DIRECTOR_PID,
COUNT(DISTINCT TRIM(MD.MID)) AS NUM_OF_MOV
FROM
M_Cast MC
JOIN
M_Director MD ON TRIM(MC.MID) = TRIM(MD.MID)
GROUP BY
ACTOR_PID,
DIRECTOR_PID
),
NUM_OF_MOVIES_BY_YC AS (
SELECT
NM.ACTOR_PID,
NM.DIRECTOR_PID,
NM.NUM_OF_MOV AS NUM_OF_MOV_BY_YC
FROM
NUM_OF_MOV_BY_ACTOR_DIRECTOR NM
JOIN
YASH_CHOPRAS_PID YCP ON NM.DIRECTOR_PID = YCP.PID
),
MAX_MOV_BY_OTHER_DIRECTORS AS (
SELECT
ACTOR_PID,
MAX(NUM_OF_MOV) AS MAX_NUM_OF_MOV
FROM
NUM_OF_MOV_BY_ACTOR_DIRECTOR NM
JOIN
YASH_CHOPRAS_PID YCP ON NM.DIRECTOR_PID <> YCP.PID
GROUP BY
ACTOR_PID
),
ACTORS_MOV_COMPARISION AS (
SELECT
NMY.ACTOR_PID,
CASE WHEN NMY.NUM_OF_MOV_BY_YC > IFNULL(NMO.MAX_NUM_OF_MOV, 0) THEN 'Y' ELSE 'N' END AS MORE_MOV_BY_YC
FROM
NUM_OF_MOVIES_BY_YC NMY
LEFT OUTER JOIN
MAX_MOV_BY_OTHER_DIRECTORS NMO ON NMY.ACTOR_PID = NMO.ACTOR_PID
)
SELECT
COUNT(DISTINCT TRIM(P.PID)) AS "Number of actor"
FROM
Person P
WHERE
TRIM(P.PID) IN (
SELECT
DISTINCT ACTOR_PID
FROM
ACTORS_MOV_COMPARISION
WHERE
MORE_MOV_BY_YC = 'Y'
);
SELECT
Musical_Styles.StyleName,
COUNT(RankedPreferences.FirstStyle)
AS FirstPreference,
COUNT(RankedPreferences.SecondStyle)
AS SecondPreference,
COUNT(RankedPreferences.ThirdStyle)
AS ThirdPreference
FROM Musical_Styles,
(SELECT (CASE WHEN
Musical_Preferences.PreferenceSeq = 1
THEN Musical_Preferences.StyleID
ELSE Null END) As FirstStyle,
(CASE WHEN
Musical_Preferences.PreferenceSeq = 2
THEN Musical_Preferences.StyleID
ELSE Null END) As SecondStyle,
(CASE WHEN
Musical_Preferences.PreferenceSeq = 3
THEN Musical_Preferences.StyleID
ELSE Null END) AS ThirdStyle
FROM Musical_Preferences) AS RankedPreferences
WHERE Musical_Styles.StyleID =
RankedPreferences.FirstStyle
OR Musical_Styles.StyleID =
RankedPreferences.SecondStyle
OR Musical_Styles.StyleID =
RankedPreferences.ThirdStyle
GROUP BY StyleID, StyleName
HAVING COUNT(FirstStyle) > 0
OR COUNT(SecondStyle) > 0
OR COUNT(ThirdStyle) > 0
ORDER BY FirstPreference DESC,
SecondPreference DESC,
ThirdPreference DESC, StyleID;
WITH AvgSalaries AS (
SELECT
facrank AS FacRank,
AVG(facsalary) AS AvSalary
FROM
university_faculty
GROUP BY
facrank
),
SalaryDifferences AS (
SELECT
university_faculty.facrank AS FacRank,
university_faculty.facfirstname AS FacFirstName,
university_faculty.faclastname AS FacLastName,
university_faculty.facsalary AS Salary,
ABS(university_faculty.facsalary - AvgSalaries.AvSalary) AS Diff
FROM
university_faculty
JOIN
AvgSalaries ON university_faculty.facrank = AvgSalaries.FacRank
),
MinDifferences AS (
SELECT
FacRank,
MIN(Diff) AS MinDiff
FROM
SalaryDifferences
GROUP BY
FacRank
)
SELECT
s.FacRank,
s.FacFirstName,
s.FacLastName,
s.Salary
FROM
SalaryDifferences s
JOIN
MinDifferences m ON s.FacRank = m.FacRank AND s.Diff = m.MinDiff;
WITH result_table AS (
SELECT
strftime('%m', pm.payment_date) AS pay_mon,
customer_id,
COUNT(pm.amount) AS pay_countpermon,
SUM(pm.amount) AS pay_amount
FROM
payment AS pm
GROUP BY
pay_mon,
customer_id
),
top10_customer AS (
SELECT
customer_id,
SUM(tb.pay_amount) AS total_payments
FROM
result_table AS tb
GROUP BY
customer_id
ORDER BY
SUM(tb.pay_amount) DESC
LIMIT
10
),
difference_per_mon AS (
SELECT
pay_mon AS month_number,
pay_mon AS month,
tb.pay_countpermon,
tb.pay_amount,
ABS(tb.pay_amount - LAG(tb.pay_amount) OVER (PARTITION BY tb.customer_id)) AS diff
FROM
result_table tb
JOIN top10_customer top ON top.customer_id = tb.customer_id
)
SELECT
month,
ROUND(max_diff, 2) AS max_diff
FROM (
SELECT
month,
diff,
month_number,
MAX(diff) OVER (PARTITION BY month) AS max_diff
FROM
difference_per_mon
) AS max_per_mon
WHERE
diff = max_diff
ORDER BY
max_diff DESC
LIMIT
1;
WITH result_table AS (
SELECT
strftime('%Y', RE.RENTAL_DATE) AS YEAR,
strftime('%m', RE.RENTAL_DATE) AS RENTAL_MONTH,
ST.STORE_ID,
COUNT(RE.RENTAL_ID) AS count
FROM
RENTAL RE
JOIN STAFF ST ON RE.STAFF_ID = ST.STAFF_ID
GROUP BY
YEAR,
RENTAL_MONTH,
ST.STORE_ID
),
monthly_sales AS (
SELECT
YEAR,
RENTAL_MONTH,
STORE_ID,
SUM(count) AS total_rentals
FROM
result_table
GROUP BY
YEAR,
RENTAL_MONTH,
STORE_ID
),
store_max_sales AS (
SELECT
STORE_ID,
YEAR,
RENTAL_MONTH,
total_rentals,
MAX(total_rentals) OVER (PARTITION BY STORE_ID) AS max_rentals
FROM
monthly_sales
)
SELECT
STORE_ID,
YEAR,
RENTAL_MONTH,
total_rentals
FROM
store_max_sales
WHERE
total_rentals = max_rentals
ORDER BY
STORE_ID;
WITH february_orders AS (
SELECT
h.hub_name AS hub_name,
COUNT(*) AS orders_february
FROM
orders o
LEFT JOIN
stores s ON o.store_id = s.store_id
LEFT JOIN
hubs h ON s.hub_id = h.hub_id
WHERE o.order_created_month = 2 AND o.order_status = 'FINISHED'
GROUP BY
h.hub_name
),
march_orders AS (
SELECT
h.hub_name AS hub_name,
COUNT(*) AS orders_march
FROM
orders o
LEFT JOIN
stores s ON o.store_id = s.store_id
LEFT JOIN
hubs h ON s.hub_id = h.hub_id
WHERE o.order_created_month = 3 AND o.order_status = 'FINISHED'
GROUP BY
h.hub_name
)
SELECT
fo.hub_name
FROM
february_orders fo
LEFT JOIN
march_orders mo ON fo.hub_name = mo.hub_name
WHERE
fo.orders_february > 0 AND
mo.orders_march > 0 AND
(CAST((mo.orders_march - fo.orders_february) AS REAL) / CAST(fo.orders_february AS REAL)) > 0.2 -- Filter for hubs with more than a 20% increase
WITH match_view AS(
SELECT
M.id,
L.name AS league,
M.season,
M.match_api_id,
T.team_long_name AS home_team,
TM.team_long_name AS away_team,
M.home_team_goal,
M.away_team_goal,
P1.player_name AS home_gk,
P2.player_name AS home_center_back_1,
P3.player_name AS home_center_back_2,
P4.player_name AS home_right_back,
P5.player_name AS home_left_back,
P6.player_name AS home_midfield_1,
P7.player_name AS home_midfield_2,
P8.player_name AS home_midfield_3,
P9.player_name AS home_midfield_4,
P10.player_name AS home_second_forward,
P11.player_name AS home_center_forward,
P12.player_name AS away_gk,
P13.player_name AS away_center_back_1,
P14.player_name AS away_center_back_2,
P15.player_name AS away_right_back,
P16.player_name AS away_left_back,
P17.player_name AS away_midfield_1,
P18.player_name AS away_midfield_2,
P19.player_name AS away_midfield_3,
P20.player_name AS away_midfield_4,
P21.player_name AS away_second_forward,
P22.player_name AS away_center_forward,
M.goal,
M.card
FROM
match M
LEFT JOIN
league L ON M.league_id = L.id
LEFT JOIN
team T ON M.home_team_api_id = T.team_api_id
LEFT JOIN
team TM ON M.away_team_api_id = TM.team_api_id
LEFT JOIN
player P1 ON M.home_player_1 = P1.player_api_id
LEFT JOIN
player P2 ON M.home_player_2 = P2.player_api_id
LEFT JOIN
player P3 ON M.home_player_3 = P3.player_api_id
LEFT JOIN
player P4 ON M.home_player_4 = P4.player_api_id
LEFT JOIN
player P5 ON M.home_player_5 = P5.player_api_id
LEFT JOIN
player P6 ON M.home_player_6 = P6.player_api_id
LEFT JOIN
player P7 ON M.home_player_7 = P7.player_api_id
LEFT JOIN
player P8 ON M.home_player_8 = P8.player_api_id
LEFT JOIN
player P9 ON M.home_player_9 = P9.player_api_id
LEFT JOIN
player P10 ON M.home_player_10 = P10.player_api_id
LEFT JOIN
player P11 ON M.home_player_11 = P11.player_api_id
LEFT JOIN
player P12 ON M.away_player_1 = P12.player_api_id
LEFT JOIN
player P13 ON M.away_player_2 = P13.player_api_id
LEFT JOIN
player P14 ON M.away_player_3 = P14.player_api_id
LEFT JOIN
player P15 ON M.away_player_4 = P15.player_api_id
LEFT JOIN
player P16 ON M.away_player_5 = P16.player_api_id
LEFT JOIN
player P17 ON M.away_player_6 = P17.player_api_id
LEFT JOIN
player P18 ON M.away_player_7 = P18.player_api_id
LEFT JOIN
player P19 ON M.away_player_8 = P19.player_api_id
LEFT JOIN
player P20 ON M.away_player_9 = P20.player_api_id
LEFT JOIN
player P21 ON M.away_player_10 = P21.player_api_id
LEFT JOIN
player P22 ON M.away_player_11 = P22.player_api_id
),
match_score AS
(
SELECT -- Displaying teams and their goals as home_team
id,
home_team AS team,
CASE
WHEN home_team_goal > away_team_goal THEN 1 ELSE 0 END AS Winning_match
FROM
match_view
UNION ALL
SELECT -- Displaying teams and their goals as away_team
id,
away_team AS team,
CASE
WHEN away_team_goal > home_team_goal THEN 1 ELSE 0 END AS Winning_match
FROM
match_view
),
winning_matches AS
(
SELECT -- Displaying total match wins for each team
MV.league,
M.team,
COUNT(CASE WHEN M.Winning_match = 1 THEN 1 END) AS wins,
ROW_NUMBER() OVER(PARTITION BY MV.league ORDER BY COUNT(CASE WHEN M.Winning_match = 1 THEN 1 END) ASC) AS rn
FROM
match_score M
JOIN
match_view MV
ON
M.id = MV.id
GROUP BY
MV.league,
team
ORDER BY
league,
wins ASC
)
SELECT
league,
team
FROM
winning_matches
WHERE
rn = 1 -- Getting the team with the least number of wins in each league
ORDER BY
league;
SELECT
before_effect,
after_effect,
after_effect - before_effect AS change_amount,
ROUND(((after_effect * 1.0 / before_effect) - 1) * 100, 2) AS percent_change,
'2018' AS year
FROM (
SELECT
SUM(CASE WHEN delta_weeks BETWEEN 1 AND 4 THEN sales END) AS after_effect,
SUM(CASE WHEN delta_weeks BETWEEN -3 AND 0 THEN sales END) AS before_effect
FROM (
SELECT
week_date,
ROUND((JULIANDAY(week_date) - JULIANDAY('2018-06-15')) / 7.0) + 1 AS delta_weeks,
sales
FROM cleaned_weekly_sales
) add_delta_weeks
) AS add_before_after
UNION ALL
SELECT
before_effect,
after_effect,
after_effect - before_effect AS change_amount,
ROUND(((after_effect * 1.0 / before_effect) - 1) * 100, 2) AS percent_change,
'2019' AS year
FROM (
SELECT
SUM(CASE WHEN delta_weeks BETWEEN 1 AND 4 THEN sales END) AS after_effect,
SUM(CASE WHEN delta_weeks BETWEEN -3 AND 0 THEN sales END) AS before_effect
FROM (
SELECT
week_date,
ROUND((JULIANDAY(week_date) - JULIANDAY('2019-06-15')) / 7.0) + 1 AS delta_weeks,
sales
FROM cleaned_weekly_sales
) add_delta_weeks
) AS add_before_after
UNION ALL
SELECT
before_effect,
after_effect,
after_effect - before_effect AS change_amount,
ROUND(((after_effect * 1.0 / before_effect) - 1) * 100, 2) AS percent_change,
'2020' AS year
FROM (
SELECT
SUM(CASE WHEN delta_weeks BETWEEN 1 AND 4 THEN sales END) AS after_effect,
SUM(CASE WHEN delta_weeks BETWEEN -3 AND 0 THEN sales END) AS before_effect
FROM (
SELECT
week_date,
ROUND((JULIANDAY(week_date) - JULIANDAY('2020-06-15')) / 7.0) + 1 AS delta_weeks,
sales
FROM cleaned_weekly_sales
) add_delta_weeks
) AS add_before_after
ORDER BY year;
with year_points as (
select races.year,
drivers.forename || ' ' || drivers.surname as driver,
constructors.name as constructor,
sum(results.points) as points
from results
left join races on results.race_id = races.race_id -- Ensure these columns exist in your schema
left join drivers on results.driver_id = drivers.driver_id -- Ensure these columns exist in your schema
left join constructors on results.constructor_id = constructors.constructor_id -- Ensure these columns exist in your schema
group by races.year, driver
union
select races.year,
null as driver,
constructors.name as constructor,
sum(results.points) as points
from results
left join races on results.race_id = races.race_id -- Ensure these columns exist in your schema
left join drivers on results.driver_id = drivers.driver_id -- Ensure these columns exist in your schema
left join constructors on results.constructor_id = constructors.constructor_id -- Ensure these columns exist in your schema
group by races.year, constructor
),
max_points as (
select year,
max(case when driver is not null then points else null end) as max_driver_points,
max(case when constructor is not null then points else null end) as max_constructor_points
from year_points
group by year
)
select max_points.year,
drivers_year_points.driver,
constructors_year_points.constructor
from max_points
left join year_points as drivers_year_points on
max_points.year = drivers_year_points.year and
max_points.max_driver_points = drivers_year_points.points and
drivers_year_points.driver is not null
left join year_points as constructors_year_points on
max_points.year = constructors_year_points.year and
max_points.max_constructor_points = constructors_year_points.points and
constructors_year_points.constructor is not null
order by max_points.year;
WITH timestamps AS
(
SELECT
DATE_TRUNC(year,DATEADD(year,-1,DATE '2024-08-29')) AS ref_timestamp,
LAST_DAY(DATEADD(week,2 + CAST(WEEKISO(ref_timestamp) != 1 AS INTEGER),ref_timestamp),week) AS end_week,
DATEADD(day, day_num - 7, end_week) AS date_valid_std
FROM
(
SELECT
ROW_NUMBER() OVER (ORDER BY SEQ1()) AS day_num
FROM
TABLE(GENERATOR(rowcount => 7))
)
)
SELECT
country,
postal_code,
date_valid_std,
tot_snowfall_in
FROM
GLOBAL_WEATHER__CLIMATE_DATA_FOR_BI.standard_tile.history_day
NATURAL INNER JOIN
timestamps
WHERE
country='US' AND
tot_snowfall_in > 6.0
ORDER BY
postal_code,date_valid_std
;
WITH big_banks AS (
SELECT id_rssd
FROM FINANCE__ECONOMICS.CYBERSYN.financial_institution_timeseries
WHERE variable = 'ASSET'
AND date = '2022-12-31'
AND value > 1E10
)
SELECT name
FROM FINANCE__ECONOMICS.CYBERSYN.financial_institution_timeseries AS ts
INNER JOIN FINANCE__ECONOMICS.CYBERSYN.financial_institution_attributes AS att ON (ts.variable = att.variable)
INNER JOIN FINANCE__ECONOMICS.CYBERSYN.financial_institution_entities AS ent ON (ts.id_rssd = ent.id_rssd)
INNER JOIN big_banks ON (big_banks.id_rssd = ts.id_rssd)
WHERE ts.date = '2022-12-31'
AND att.variable_name = '% Insured (Estimated)'
AND att.frequency = 'Quarterly'
AND ent.is_active = True
ORDER BY (1 - value) DESC
LIMIT 10;
WITH TractPop AS (
SELECT
CG."BlockGroupID",
FCV."CensusValue",
CG."StateCountyTractID",
CG."BlockGroupPolygon"
FROM
CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE.PUBLIC."Dim_CensusGeography" CG
JOIN
CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE.PUBLIC."Fact_CensusValues_ACS2021" FCV
ON CG."BlockGroupID" = FCV."BlockGroupID"
WHERE
CG."StateAbbrev" = 'NY'
AND FCV."MetricID" = 'B01003_001E'
),
TractGroup AS (
SELECT
CG."StateCountyTractID",
SUM(FCV."CensusValue") AS "TotalTractPop"
FROM
CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE.PUBLIC."Dim_CensusGeography" CG
JOIN
CENSUS_GALAXY__ZIP_CODE_TO_BLOCK_GROUP_SAMPLE.PUBLIC."Fact_CensusValues_ACS2021" FCV
ON CG."BlockGroupID" = FCV."BlockGroupID"
WHERE
CG."StateAbbrev" = 'NY'
AND FCV."MetricID" = 'B01003_001E'
GROUP BY
CG."StateCountyTractID"
)
SELECT
TP."BlockGroupID",
TP."CensusValue",
TP."StateCountyTractID",
TG."TotalTractPop",
CASE WHEN TG."TotalTractPop" <> 0 THEN TP."CensusValue" / TG."TotalTractPop" ELSE 0 END AS "BlockGroupRatio"
FROM
TractPop TP
JOIN
TractGroup TG
ON TP."StateCountyTractID" = TG."StateCountyTractID";
SELECT
YEAR(claims.date_of_loss) AS year_of_loss,
claims.nfip_community_name,
SUM(claims.building_damage_amount) AS total_building_damage_amount,
SUM(claims.contents_damage_amount) AS total_contents_damage_amount
FROM WEATHER__ENVIRONMENT.CYBERSYN.fema_national_flood_insurance_program_claim_index claims
WHERE
claims.nfip_community_name = 'City Of New York'
AND year_of_loss >=2010 AND year_of_loss <=2019
GROUP BY year_of_loss, claims.nfip_community_name
ORDER BY year_of_loss, claims.nfip_community_name;
WITH Commuters AS (
SELECT
GE."ZipCode",
SUM(CASE WHEN M."MetricID" = 'B08303_013E' THEN F."CensusValueByZip" ELSE 0 END +
CASE WHEN M."MetricID" = 'B08303_012E' THEN F."CensusValueByZip" ELSE 0 END) AS "Num_Commuters_1Hr_Travel_Time"
FROM
CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE.PUBLIC."LU_GeographyExpanded" GE
JOIN
CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE.PUBLIC."Fact_CensusValues_ACS2021_ByZip" F
ON GE."ZipCode" = F."ZipCode"
JOIN
CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE.PUBLIC."Dim_CensusMetrics" M
ON F."MetricID" = M."MetricID"
WHERE
GE."PreferredStateAbbrev" = 'NY'
AND (M."MetricID" = 'B08303_013E' OR M."MetricID" = 'B08303_012E') -- Metric IDs for commuters with 1+ hour travel time
GROUP BY
GE."ZipCode"
),
StateBenchmark AS (
SELECT
SB."StateAbbrev",
SUM(SB."StateBenchmarkValue") AS "StateBenchmark_Over1HrTravelTime",
SB."TotalStatePopulation"
FROM
CENSUS_GALAXY__AIML_MODEL_DATA_ENRICHMENT_SAMPLE.PUBLIC."Fact_StateBenchmark_ACS2021" SB
WHERE
SB."MetricID" IN ('B08303_013E', 'B08303_012E')
AND SB."StateAbbrev" = 'NY'
GROUP BY
SB."StateAbbrev", SB."TotalStatePopulation"
)
SELECT
C."ZipCode",
SUM(C."Num_Commuters_1Hr_Travel_Time") AS "Total_Commuters_1Hr_Travel_Time",
SB."StateBenchmark_Over1HrTravelTime",
SB."TotalStatePopulation",
FROM
Commuters C
CROSS JOIN
StateBenchmark SB
GROUP BY
C."ZipCode", SB."StateBenchmark_Over1HrTravelTime", SB."TotalStatePopulation"
ORDER BY
"Total_Commuters_1Hr_Travel_Time" DESC
LIMIT 1;
WITH push_send AS (
SELECT
id,
app_group_id,
user_id,
campaign_id,
message_variation_id,
platform,
ad_tracking_enabled,
TO_TIMESTAMP(TIME) AS "TIME",
'Send' AS "EVENT_TYPE"
FROM
BRAZE_USER_EVENT_DEMO_DATASET.PUBLIC.USERS_MESSAGES_PUSHNOTIFICATION_SEND_VIEW
WHERE
TO_TIMESTAMP(TIME) BETWEEN '2023-06-01 08:00:00' AND '2023-06-01 09:00:00'
),
push_bounce AS (
SELECT
id,
app_group_id,
user_id,
campaign_id,
message_variation_id,
platform,
ad_tracking_enabled,
TO_TIMESTAMP(TIME) AS "TIME",
'Bounce' AS "EVENT_TYPE"
FROM
BRAZE_USER_EVENT_DEMO_DATASET.PUBLIC.USERS_MESSAGES_PUSHNOTIFICATION_BOUNCE_VIEW
WHERE
TO_TIMESTAMP(TIME) BETWEEN '2023-06-01 08:00:00' AND '2023-06-01 09:00:00'
),
push_open AS (
SELECT
id,
app_group_id,
user_id,
campaign_id,
message_variation_id,
platform,
ad_tracking_enabled,
TO_TIMESTAMP(TIME) AS "TIME",
'Open' AS "EVENT_TYPE",
carrier,
browser,
device_model
FROM
BRAZE_USER_EVENT_DEMO_DATASET.PUBLIC.USERS_MESSAGES_PUSHNOTIFICATION_OPEN_VIEW
WHERE
TO_TIMESTAMP(TIME) BETWEEN '2023-06-01 08:00:00' AND '2023-06-01 09:00:00'
),
push_open_influence AS (
SELECT
id,
app_group_id,
user_id,
campaign_id,
message_variation_id,
platform,
TO_TIMESTAMP(TIME) AS "TIME",
'Influenced Open' AS "EVENT_TYPE",
carrier,
browser,
device_model
FROM
BRAZE_USER_EVENT_DEMO_DATASET.PUBLIC.USERS_MESSAGES_PUSHNOTIFICATION_INFLUENCEDOPEN_VIEW
WHERE
TO_TIMESTAMP(TIME) BETWEEN '2023-06-01 08:00:00' AND '2023-06-01 09:00:00'
)
SELECT
ps.app_group_id,
ps.campaign_id,
ps.user_id,
ps.time,
po.time push_open_time,
ps.message_variation_id,
ps.platform,
ps.ad_tracking_enabled,
po.carrier,
po.browser,
po.device_model,
COUNT(
DISTINCT ps.id
) push_notification_sends,
COUNT(
DISTINCT ps.user_id
) unique_push_notification_sends,
COUNT(
DISTINCT pb.id
) push_notification_bounced,
COUNT(
DISTINCT pb.user_id
) unique_push_notification_bounced,
COUNT(
DISTINCT po.id
) push_notification_open,
COUNT(
DISTINCT po.user_id
) unique_push_notification_opened,
COUNT(
DISTINCT poi.id
) push_notification_influenced_open,
COUNT(
DISTINCT poi.user_id
) unique_push_notification_influenced_open
FROM
push_send ps
LEFT JOIN push_bounce pb
ON ps.message_variation_id = pb.message_variation_id
AND ps.user_id = pb.user_id
AND ps.app_group_id = pb.app_group_id
LEFT JOIN push_open po
ON ps.message_variation_id = po.message_variation_id
AND ps.user_id = po.user_id
AND ps.app_group_id = po.app_group_id
LEFT JOIN push_open_influence poi
ON ps.message_variation_id = poi.message_variation_id
AND ps.user_id = poi.user_id
AND ps.app_group_id = poi.app_group_id
GROUP BY
1,2,3,4,5,6,7,8,9,10,11;
WITH zip_areas AS (
SELECT
geo.geo_id,
geo.geo_name AS zip,
states.related_geo_name AS state,
countries.related_geo_name AS country,
ST_AREA(TRY_TO_GEOGRAPHY(value)) AS area
FROM US_ADDRESSES__POI.CYBERSYN.geography_index AS geo
JOIN US_ADDRESSES__POI.CYBERSYN.geography_relationships AS states
ON (geo.geo_id = states.geo_id AND states.related_level = 'State')
JOIN US_ADDRESSES__POI.CYBERSYN.geography_relationships AS countries
ON (geo.geo_id = countries.geo_id AND countries.related_level = 'Country')
JOIN US_ADDRESSES__POI.CYBERSYN.geography_characteristics AS chars
ON (geo.geo_id = chars.geo_id AND chars.relationship_type = 'coordinates_geojson')
WHERE geo.level = 'CensusZipCodeTabulationArea'
),
zip_area_ranks AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY country, state ORDER BY area DESC, geo_id) AS zip_area_rank
FROM zip_areas
)
SELECT addr.number, addr.street, addr.street_type
FROM US_ADDRESSES__POI.CYBERSYN.us_addresses AS addr
JOIN zip_area_ranks AS areas
ON (addr.id_zip = areas.geo_id)
WHERE addr.state = 'FL' AND areas.country = 'United States' AND areas.zip_area_rank = 1
ORDER BY LATITUDE DESC
LIMIT 10;
WITH ytd_performance AS (
SELECT
ticker,
MIN(date) OVER (PARTITION BY ticker) AS start_of_year_date,
FIRST_VALUE(value) OVER (PARTITION BY ticker ORDER BY date ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS start_of_year_price,
MAX(date) OVER (PARTITION BY ticker) AS latest_date,
LAST_VALUE(value) OVER (PARTITION BY ticker ORDER BY date ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS latest_price
FROM FINANCE__ECONOMICS.CYBERSYN.stock_price_timeseries
WHERE
ticker IN ('AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'NVDA')
AND date BETWEEN DATE '2024-01-01' AND DATE '2024-06-30' -- Adjusted to cover only from the start of 2024 to the end of June 2024
AND variable_name = 'Post-Market Close'
)
SELECT
ticker,
(latest_price - start_of_year_price) / start_of_year_price * 100 AS percentage_change_ytd
FROM
ytd_performance
GROUP BY
ticker, start_of_year_date, start_of_year_price, latest_date, latest_price
ORDER BY percentage_change_ytd DESC;
WITH double_entry_book AS (
-- Debits
SELECT
"to_address" AS "address",
"value" AS "value"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TRACES"
WHERE "to_address" IS NOT NULL
AND "status" = 1
AND ("call_type" NOT IN ('delegatecall', 'callcode', 'staticcall') OR "call_type" IS NULL)
UNION ALL
-- Credits
SELECT
"from_address" AS "address",
- "value" AS "value"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TRACES"
WHERE "from_address" IS NOT NULL
AND "status" = 1
AND ("call_type" NOT IN ('delegatecall', 'callcode', 'staticcall') OR "call_type" IS NULL)
UNION ALL
-- Transaction fees debits
SELECT
"miner" AS "address",
SUM(CAST("receipt_gas_used" AS NUMBER) * CAST("gas_price" AS NUMBER)) AS "value"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TRANSACTIONS" AS "transactions"
JOIN "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."BLOCKS" AS "blocks"
ON "blocks"."number" = "transactions"."block_number"
GROUP BY "blocks"."miner"
UNION ALL
-- Transaction fees credits
SELECT
"from_address" AS "address",
-(CAST("receipt_gas_used" AS NUMBER) * CAST("gas_price" AS NUMBER)) AS "value"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TRANSACTIONS"
),
top_10_balances AS (
SELECT
"address",
SUM("value") AS "balance"
FROM double_entry_book
GROUP BY "address"
ORDER BY "balance" DESC
LIMIT 10
)
SELECT
ROUND(AVG("balance") / 1e15, 2) AS "average_balance_trillion"
FROM top_10_balances;
WITH bounding_area AS (
SELECT "geometry" AS geometry
FROM GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_FEATURES,
LATERAL FLATTEN(INPUT => planet_features."all_tags") AS "tag"
WHERE "feature_type" = 'multipolygons'
AND "tag".value:"key" = 'wikidata'
AND "tag".value:"value" = 'Q35'
),
highway_info AS (
SELECT
SUM(ST_LENGTH(
ST_GEOGRAPHYFROMWKB(planet_features."geometry")
)
) AS highway_length,
"tag".value:"value" AS highway_type
FROM
GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_FEATURES AS planet_features,
bounding_area
CROSS JOIN LATERAL FLATTEN(INPUT => planet_features."all_tags") AS "tag"
WHERE "tag".value:"key" = 'highway'
AND "feature_type" = 'lines'
AND ST_DWITHIN(
ST_GEOGFROMWKB(planet_features."geometry"),
ST_GEOGFROMWKB(bounding_area.geometry),
0.0
)
GROUP BY highway_type
)
SELECT
REPLACE(highway_type, '"', '') AS highway_type
FROM
highway_info
ORDER BY
highway_length DESC
LIMIT 5;
WITH HighestReleases AS (
SELECT
HR."Name",
HR."Version"
FROM (
SELECT
"Name",
"Version",
ROW_NUMBER() OVER (
PARTITION BY "Name"
ORDER BY
TO_NUMBER(PARSE_JSON("VersionInfo"):"Ordinal") DESC
) AS RowNumber
FROM
DEPS_DEV_V1.DEPS_DEV_V1.PACKAGEVERSIONS
WHERE
"System" = 'NPM'
AND TO_BOOLEAN(PARSE_JSON("VersionInfo"):"IsRelease") = TRUE
) AS HR
WHERE HR.RowNumber = 1
),
PVP AS (
SELECT
PVP."Name",
PVP."Version",
PVP."ProjectType",
PVP."ProjectName"
FROM
DEPS_DEV_V1.DEPS_DEV_V1.PACKAGEVERSIONTOPROJECT AS PVP
JOIN
HighestReleases AS HR
ON
PVP."Name" = HR."Name"
AND PVP."Version" = HR."Version"
WHERE
PVP."System" = 'NPM'
AND PVP."ProjectType" = 'GITHUB'
)
SELECT
PVP."Name",
PVP."Version"
FROM
PVP
JOIN
DEPS_DEV_V1.DEPS_DEV_V1.PROJECTS AS P
ON
PVP."ProjectType" = P."Type"
AND PVP."ProjectName" = P."Name"
ORDER BY
P."StarsCount" DESC
LIMIT 8;
WITH Patent_Matches AS (
SELECT
TO_DATE(CAST(ANY_VALUE(patentsdb."filing_date") AS STRING), 'YYYYMMDD') AS Patent_Filing_Date,
patentsdb."application_number" AS Patent_Application_Number,
MAX(abstract_info.value:"text") AS Patent_Title,
MAX(abstract_info.value:"language") AS Patent_Title_Language
FROM
PATENTS.PATENTS.PUBLICATIONS AS patentsdb,
LATERAL FLATTEN(input => patentsdb."abstract_localized") AS abstract_info
WHERE
LOWER(abstract_info.value:"text") LIKE '%internet of things%'
AND patentsdb."country_code" = 'US'
GROUP BY
Patent_Application_Number
),
Date_Series_Table AS (
SELECT
DATEADD(day, seq4(), DATE '2008-01-01') AS day,
0 AS Number_of_Patents
FROM
TABLE(
GENERATOR(
ROWCOUNT => 5479
)
)
ORDER BY
day
)
SELECT
TO_CHAR(Date_Series_Table.day, 'YYYY-MM') AS Patent_Date_YearMonth,
COUNT(Patent_Matches.Patent_Application_Number) AS Number_of_Patent_Applications
FROM
Date_Series_Table
LEFT JOIN Patent_Matches
ON Date_Series_Table.day = Patent_Matches.Patent_Filing_Date
WHERE
Date_Series_Table.day < DATE '2023-01-01'
GROUP BY
TO_CHAR(Date_Series_Table.day, 'YYYY-MM')
ORDER BY
Patent_Date_YearMonth;
WITH A AS (
SELECT
"reference_bases",
"start_position"
FROM
"HUMAN_GENOME_VARIANTS"."HUMAN_GENOME_VARIANTS"."_1000_GENOMES_PHASE_3_OPTIMIZED_SCHEMA_VARIANTS_20150220"
WHERE
"reference_bases" IN ('AT', 'TA')
),
B AS (
SELECT
"reference_bases",
MIN("start_position") AS "min_start_position",
MAX("start_position") AS "max_start_position",
COUNT(1) AS "total_count"
FROM
A
GROUP BY
"reference_bases"
),
min_counts AS (
SELECT
A."reference_bases", -- Explicitly referencing the column from table A
A."start_position" AS "min_start_position",
COUNT(1) AS "min_count"
FROM
A
INNER JOIN B
ON A."reference_bases" = B."reference_bases"
WHERE
A."start_position" = B."min_start_position"
GROUP BY
A."reference_bases", A."start_position"
),
max_counts AS (
SELECT
A."reference_bases", -- Explicitly referencing the column from table A
A."start_position" AS "max_start_position",
COUNT(1) AS "max_count"
FROM
A
INNER JOIN B
ON A."reference_bases" = B."reference_bases"
WHERE
A."start_position" = B."max_start_position"
GROUP BY
A."reference_bases", A."start_position"
)
SELECT
B."reference_bases", -- Explicitly referencing the column from table B
B."min_start_position",
CAST(min_counts."min_count" AS FLOAT) / B."total_count" AS "min_position_ratio",
B."max_start_position",
CAST(max_counts."max_count" AS FLOAT) / B."total_count" AS "max_position_ratio"
FROM
B
LEFT JOIN
min_counts ON B."reference_bases" = min_counts."reference_bases" AND B."min_start_position" = min_counts."min_start_position"
LEFT JOIN
max_counts ON B."reference_bases" = max_counts."reference_bases" AND B."max_start_position" = max_counts."max_start_position"
ORDER BY
B."reference_bases";
SELECT
genex."case_barcode" AS "case_barcode",
genex."sample_barcode" AS "sample_barcode",
genex."aliquot_barcode" AS "aliquot_barcode",
genex."HGNC_gene_symbol" AS "HGNC_gene_symbol",
clinical_info."Variant_Type" AS "Variant_Type",
genex."gene_id" AS "gene_id",
genex."normalized_count" AS "normalized_count",
genex."project_short_name" AS "project_short_name",
clinical_info."demo__gender" AS "gender",
clinical_info."demo__vital_status" AS "vital_status",
clinical_info."demo__days_to_death" AS "days_to_death"
FROM (
SELECT
case_list."Variant_Type" AS "Variant_Type",
case_list."case_barcode" AS "case_barcode",
clinical."demo__gender",
clinical."demo__vital_status",
clinical."demo__days_to_death"
FROM
(SELECT
mutation."case_barcode",
mutation."Variant_Type"
FROM
"TCGA"."TCGA_VERSIONED"."SOMATIC_MUTATION_HG19_DCC_2017_02" AS mutation
WHERE
mutation."Hugo_Symbol" = 'CDKN2A'
AND mutation."project_short_name" = 'TCGA-BLCA'
GROUP BY
mutation."case_barcode",
mutation."Variant_Type"
ORDER BY
mutation."case_barcode"
) AS case_list /* end case_list */
INNER JOIN
"TCGA"."TCGA_VERSIONED"."CLINICAL_GDC_R39" AS clinical
ON
case_list."case_barcode" = clinical."submitter_id" /* end clinical annotation */ ) AS clinical_info
INNER JOIN
"TCGA"."TCGA_VERSIONED"."RNASEQ_HG19_GDC_2017_02" AS genex
ON
genex."case_barcode" = clinical_info."case_barcode"
WHERE
genex."HGNC_gene_symbol" IN ('MDM2', 'TP53', 'CDKN1A','CCNE1')
ORDER BY
"case_barcode",
"HGNC_gene_symbol";
WITH data AS (
SELECT
"ZIPSTARTNAME"."borough" AS "borough_start",
"ZIPSTARTNAME"."neighborhood" AS "neighborhood_start",
"ZIPENDNAME"."borough" AS "borough_end",
"ZIPENDNAME"."neighborhood" AS "neighborhood_end",
CAST("TRI"."tripduration" / 60 AS NUMERIC) AS "trip_minutes",
"WEA"."temp" AS "temperature",
CAST("WEA"."wdsp" AS NUMERIC) AS "wind_speed",
"WEA"."prcp" AS "precipitation",
EXTRACT(MONTH FROM DATE("TRI"."starttime")) AS "start_month"
FROM
"NEW_YORK_CITIBIKE_1"."NEW_YORK_CITIBIKE"."CITIBIKE_TRIPS" AS "TRI"
INNER JOIN
"NEW_YORK_CITIBIKE_1"."GEO_US_BOUNDARIES"."ZIP_CODES" AS "ZIPSTART"
ON ST_WITHIN(
ST_POINT("TRI"."start_station_longitude", "TRI"."start_station_latitude"),
ST_GEOGFROMWKB("ZIPSTART"."zip_code_geom")
)
INNER JOIN
"NEW_YORK_CITIBIKE_1"."GEO_US_BOUNDARIES"."ZIP_CODES" AS "ZIPEND"
ON ST_WITHIN(
ST_POINT("TRI"."end_station_longitude", "TRI"."end_station_latitude"),
ST_GEOGFROMWKB("ZIPEND"."zip_code_geom")
)
INNER JOIN
"NEW_YORK_CITIBIKE_1"."NOAA_GSOD"."GSOD2014" AS "WEA"
ON TO_DATE(CONCAT("WEA"."year", LPAD("WEA"."mo", 2, '0'), LPAD("WEA"."da", 2, '0')), 'YYYYMMDD') = DATE("TRI"."starttime")
INNER JOIN
"NEW_YORK_CITIBIKE_1"."CYCLISTIC"."ZIP_CODES" AS "ZIPSTARTNAME"
ON "ZIPSTART"."zip_code" = CAST("ZIPSTARTNAME"."zip" AS STRING)
INNER JOIN
"NEW_YORK_CITIBIKE_1"."CYCLISTIC"."ZIP_CODES" AS "ZIPENDNAME"
ON "ZIPEND"."zip_code" = CAST("ZIPENDNAME"."zip" AS STRING)
WHERE
"WEA"."wban" = (
SELECT "wban"
FROM "NEW_YORK_CITIBIKE_1"."NOAA_GSOD"."STATIONS"
WHERE
"state" = 'NY'
AND LOWER("name") LIKE LOWER('%New York Central Park%')
LIMIT 1
)
AND EXTRACT(YEAR FROM DATE("TRI"."starttime")) = 2014
),
agg_data AS (
SELECT
"borough_start",
"neighborhood_start",
"borough_end",
"neighborhood_end",
COUNT(*) AS "num_trips",
ROUND(AVG("trip_minutes"), 1) AS "avg_trip_minutes",
ROUND(AVG("temperature"), 1) AS "avg_temperature",
ROUND(AVG("wind_speed"), 1) AS "avg_wind_speed",
ROUND(AVG("precipitation"), 1) AS "avg_precipitation"
FROM data
GROUP BY
"borough_start",
"neighborhood_start",
"borough_end",
"neighborhood_end"
),
most_common_months AS (
SELECT
"borough_start",
"neighborhood_start",
"borough_end",
"neighborhood_end",
"start_month",
ROW_NUMBER() OVER (
PARTITION BY "borough_start", "neighborhood_start", "borough_end", "neighborhood_end"
ORDER BY COUNT(*) DESC
) AS "row_num"
FROM data
GROUP BY
"borough_start",
"neighborhood_start",
"borough_end",
"neighborhood_end",
"start_month"
)
SELECT
a.*,
m."start_month" AS "most_common_month"
FROM
agg_data a
JOIN
most_common_months m
ON a."borough_start" = m."borough_start"
AND a."neighborhood_start" = m."neighborhood_start"
AND a."borough_end" = m."borough_end"
AND a."neighborhood_end" = m."neighborhood_end"
AND m."row_num" = 1
ORDER BY
a."neighborhood_start",
a."neighborhood_end";
SELECT
app."patent_id" AS "patent_id",
patent."title",
app."date" AS "application_date",
filterData."bkwdCitations_1",
filterData."fwrdCitations_1",
summary."text" AS "summary_text"
FROM
PATENTSVIEW.PATENTSVIEW.BRF_SUM_TEXT AS summary
JOIN
PATENTSVIEW.PATENTSVIEW.PATENT AS patent
ON summary."patent_id" = patent."id"
JOIN
PATENTSVIEW.PATENTSVIEW.APPLICATION AS app
ON app."patent_id" = summary."patent_id"
JOIN (
SELECT DISTINCT
cpc."patent_id",
IFNULL(citation_1."bkwdCitations_1", 0) AS "bkwdCitations_1",
IFNULL(citation_1."fwrdCitations_1", 0) AS "fwrdCitations_1"
FROM
PATENTSVIEW.PATENTSVIEW.CPC_CURRENT AS cpc
JOIN (
SELECT
b."patent_id",
b."bkwdCitations_1",
f."fwrdCitations_1"
FROM (
SELECT
cited."patent_id",
COUNT(*) AS "fwrdCitations_1"
FROM
PATENTSVIEW.PATENTSVIEW.USPATENTCITATION AS cited
JOIN
PATENTSVIEW.PATENTSVIEW.APPLICATION AS apps
ON cited."patent_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" >= apps."date"
AND TRY_CAST(cited."date" AS DATE) <= DATEADD(MONTH, 1, TRY_CAST(apps."date" AS DATE)) -- Citation within 1 month
GROUP BY
cited."patent_id"
) AS f
JOIN (
SELECT
cited."patent_id",
COUNT(*) AS "bkwdCitations_1"
FROM
PATENTSVIEW.PATENTSVIEW.USPATENTCITATION AS cited
JOIN
PATENTSVIEW.PATENTSVIEW.APPLICATION AS apps
ON cited."patent_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" < apps."date"
AND TRY_CAST(cited."date" AS DATE) >= DATEADD(MONTH, -1, TRY_CAST(apps."date" AS DATE)) -- Citation within 1 month before
GROUP BY
cited."patent_id"
) AS b
ON b."patent_id" = f."patent_id"
WHERE
b."bkwdCitations_1" IS NOT NULL
AND f."fwrdCitations_1" IS NOT NULL
AND (b."bkwdCitations_1" > 0 OR f."fwrdCitations_1" > 0)
) AS citation_1
ON cpc."patent_id" = citation_1."patent_id"
WHERE
cpc."subsection_id" = 'C05'
OR cpc."group_id" = 'A01G'
) AS filterData
ON app."patent_id" = filterData."patent_id"
ORDER BY app."date";
WITH totals AS (
-- Aggregate monthly totals for Bitcoin txs, input/output UTXOs,
-- and input/output values (UTXO stands for Unspent Transaction Output)
SELECT
"txs_tot"."block_timestamp_month" AS tx_month,
COUNT("txs_tot"."hash") AS tx_count,
SUM("txs_tot"."input_count") AS tx_inputs,
SUM("txs_tot"."output_count") AS tx_outputs,
SUM("txs_tot"."input_value") / 100000000 AS tx_input_val,
SUM("txs_tot"."output_value") / 100000000 AS tx_output_val
FROM CRYPTO.CRYPTO_BITCOIN.TRANSACTIONS AS "txs_tot"
WHERE "txs_tot"."block_timestamp_month" BETWEEN CAST('2021-01-01' AS DATE) AND CAST('2021-12-31' AS DATE)
GROUP BY "txs_tot"."block_timestamp_month"
ORDER BY "txs_tot"."block_timestamp_month" DESC
),
coinjoinOuts AS (
-- Builds a table where each row represents an output of a
-- potential CoinJoin tx, defined as a tx that had more
-- than two outputs and had a total output value less than its
-- input value, per Adam Fiscor's description in this article:
SELECT
"txs"."hash",
"txs"."block_number",
"txs"."block_timestamp_month",
"txs"."input_count",
"txs"."output_count",
"txs"."input_value",
"txs"."output_value",
"o".value:"value" AS "outputs_val"
FROM CRYPTO.CRYPTO_BITCOIN.TRANSACTIONS AS "txs",
LATERAL FLATTEN(INPUT => "txs"."outputs") AS "o"
WHERE "txs"."output_count" > 2
AND "txs"."output_value" <= "txs"."input_value"
AND "txs"."block_timestamp_month" BETWEEN CAST('2021-01-01' AS DATE) AND CAST('2021-12-31' AS DATE)
ORDER BY "txs"."block_number", "txs"."hash" DESC
),
coinjoinTxs AS (
-- Builds a table of just the distinct CoinJoin tx hashes
-- which had more than one equal-value output.
SELECT
"coinjoinouts"."hash" AS "cjhash",
"coinjoinouts"."outputs_val" AS outputVal,
COUNT(*) AS cjOuts
FROM coinjoinOuts AS "coinjoinouts"
GROUP BY "coinjoinouts"."hash", "coinjoinouts"."outputs_val"
HAVING COUNT(*) > 1
),
coinjoinsD AS (
-- Filter out all potential CoinJoin txs that did not have
-- more than one equal-value output. Do not list the
-- outputs themselves, only the distinct tx hashes and
-- their input/output counts and values.
SELECT DISTINCT
"coinjoinouts"."hash",
"coinjoinouts"."block_number",
"coinjoinouts"."block_timestamp_month",
"coinjoinouts"."input_count",
"coinjoinouts"."output_count",
"coinjoinouts"."input_value",
"coinjoinouts"."output_value"
FROM coinjoinOuts AS "coinjoinouts"
INNER JOIN coinjoinTxs AS "coinjointxs"
ON "coinjoinouts"."hash" = "coinjointxs"."cjhash"
),
coinjoins AS (
-- Aggregate monthly totals for CoinJoin txs, input/output UTXOs,
-- and input/output values
SELECT
"cjs"."block_timestamp_month" AS cjs_month,
COUNT("cjs"."hash") AS cjs_count,
SUM("cjs"."input_count") AS cjs_inputs,
SUM("cjs"."output_count") AS cjs_outputs,
SUM("cjs"."input_value") / 100000000 AS cjs_input_val,
SUM("cjs"."output_value") / 100000000 AS cjs_output_val
FROM coinjoinsD AS "cjs"
GROUP BY "cjs"."block_timestamp_month"
ORDER BY "cjs"."block_timestamp_month" DESC
)
SELECT EXTRACT(MONTH FROM tx_month) AS month,
-- Calculate resulting CoinJoin percentages:
-- tx_percent = percent of monthly Bitcoin txs that were CoinJoins
ROUND(coinjoins.cjs_count / totals.tx_count * 100, 1) AS tx_percent,
-- utxos_percent = percent of monthly Bitcoin utxos that were CoinJoins
ROUND((coinjoins.cjs_inputs / totals.tx_inputs + coinjoins.cjs_outputs / totals.tx_outputs) / 2 * 100, 1) AS utxos_percent,
-- value_percent = percent of monthly Bitcoin volume that took place
-- in CoinJoined transactions
ROUND(coinjoins.cjs_input_val / totals.tx_input_val * 100, 1) AS value_percent
FROM totals
INNER JOIN coinjoins
ON totals.tx_month = coinjoins.cjs_month
ORDER BY value_percent DESC
LIMIT 1;
WITH double_entry_book AS (
-- debits
SELECT
ARRAY_TO_STRING("inputs".value:addresses, ',') AS "address", -- Use the correct JSON path notation
"inputs".value:type AS "type",
- "inputs".value:value AS "value"
FROM CRYPTO.CRYPTO_BITCOIN_CASH.TRANSACTIONS,
LATERAL FLATTEN(INPUT => "inputs") AS "inputs"
WHERE TO_TIMESTAMP("block_timestamp" / 1000000) >= '2014-03-01'
AND TO_TIMESTAMP("block_timestamp" / 1000000) < '2014-04-01'
UNION ALL
-- credits
SELECT
ARRAY_TO_STRING("outputs".value:addresses, ',') AS "address", -- Use the correct JSON path notation
"outputs".value:type AS "type",
"outputs".value:value AS "value"
FROM CRYPTO.CRYPTO_BITCOIN_CASH.TRANSACTIONS,
LATERAL FLATTEN(INPUT => "outputs") AS "outputs"
WHERE TO_TIMESTAMP("block_timestamp" / 1000000) >= '2014-03-01'
AND TO_TIMESTAMP("block_timestamp" / 1000000) < '2014-04-01'
),
address_balances AS (
SELECT
"address",
"type",
SUM("value") AS "balance"
FROM double_entry_book
GROUP BY "address", "type"
),
max_min_balances AS (
SELECT
"type",
MAX("balance") AS max_balance,
MIN("balance") AS min_balance
FROM address_balances
GROUP BY "type"
)
SELECT
REPLACE("type", '"', '') AS "type", -- Replace double quotes with nothing
max_balance,
min_balance
FROM max_min_balances
ORDER BY "type";
WITH
sm_images AS (
SELECT
"SeriesInstanceUID" AS "digital_slide_id",
"StudyInstanceUID" AS "case_id",
"ContainerIdentifier" AS "physical_slide_id",
"PatientID" AS "patient_id",
"TotalPixelMatrixColumns" AS "width",
"TotalPixelMatrixRows" AS "height",
"collection_id",
"crdc_instance_uuid",
"gcs_url",
CAST(
"SharedFunctionalGroupsSequence"[0]."PixelMeasuresSequence"[0]."PixelSpacing"[0] AS FLOAT
) AS "pixel_spacing",
CASE "TransferSyntaxUID"
WHEN '1.2.840.10008.1.2.4.50' THEN 'jpeg'
WHEN '1.2.840.10008.1.2.4.91' THEN 'jpeg2000'
ELSE 'other'
END AS "compression"
FROM
IDC.IDC_V17.DICOM_ALL
WHERE
"Modality" = 'SM'
AND "ImageType"[2] = 'VOLUME'
),
tissue_types AS (
SELECT DISTINCT *
FROM (
SELECT
"SeriesInstanceUID" AS "digital_slide_id",
CASE "steps_unnested2".value:"CodeValue"::STRING
WHEN '17621005' THEN 'normal' -- meaning: 'Normal' (i.e., non-neoplastic)
WHEN '86049000' THEN 'tumor' -- meaning: 'Neoplasm, Primary'
ELSE 'other' -- meaning: 'Neoplasm, Metastatic'
END AS "tissue_type"
FROM
IDC.IDC_V17.DICOM_ALL
CROSS JOIN
LATERAL FLATTEN(input => "SpecimenDescriptionSequence"[0]."PrimaryAnatomicStructureSequence") AS "steps_unnested1"
CROSS JOIN
LATERAL FLATTEN(input => "steps_unnested1".value:"PrimaryAnatomicStructureModifierSequence") AS "steps_unnested2"
)
),
specimen_preparation_sequence_items AS (
SELECT DISTINCT *
FROM (
SELECT
"SeriesInstanceUID" AS "digital_slide_id",
"steps_unnested2".value:"ConceptNameCodeSequence"[0]."CodeMeaning"::STRING AS "item_name",
"steps_unnested2".value:"ConceptCodeSequence"[0]."CodeMeaning"::STRING AS "item_value"
FROM
IDC.IDC_V17.DICOM_ALL
CROSS JOIN
LATERAL FLATTEN(input => "SpecimenDescriptionSequence"[0]."SpecimenPreparationSequence") AS "steps_unnested1"
CROSS JOIN
LATERAL FLATTEN(input => "steps_unnested1".value:"SpecimenPreparationStepContentItemSequence") AS "steps_unnested2"
)
)
SELECT
a.*,
b."tissue_type",
REPLACE(REPLACE(a."collection_id", 'tcga_luad', 'luad'), 'tcga_lusc', 'lscc') AS "cancer_subtype"
FROM
sm_images AS a
JOIN tissue_types AS b
ON b."digital_slide_id" = a."digital_slide_id"
JOIN specimen_preparation_sequence_items AS c
ON c."digital_slide_id" = a."digital_slide_id"
WHERE
(a."collection_id" = 'tcga_luad' OR a."collection_id" = 'tcga_lusc')
AND a."compression" != 'other'
AND (b."tissue_type" = 'normal' OR b."tissue_type" = 'tumor')
AND (c."item_name" = 'Embedding medium' AND c."item_value" = 'Tissue freezing medium')
ORDER BY
a."crdc_instance_uuid";
WITH BlackRace AS (
SELECT CAST("Code" AS INT) AS CODE
FROM DEATH.DEATH.RACE
WHERE LOWER("Description") LIKE '%black%'
)
SELECT
v."Age",
v."Total" AS "Vehicle_Total",
v."Black" AS "Vehicle_Black",
g."Total" AS "Gun_Total",
g."Black" AS "Gun_Black"
FROM (
SELECT
"Age",
COUNT(*) AS "Total",
COUNT_IF("Race" IN (SELECT CODE FROM BlackRace)) AS "Black"
FROM DEATH.DEATH.DEATHRECORDS d
JOIN (
SELECT
DISTINCT e."DeathRecordId" AS "id"
FROM DEATH.DEATH.ENTITYAXISCONDITIONS e
JOIN (
SELECT *
FROM DEATH.DEATH.ICD10CODE
WHERE LOWER("Description") LIKE '%vehicle%'
) c
ON e."Icd10Code" = c."Code"
) f
ON d."Id" = f."id"
WHERE "Age" BETWEEN 12 AND 18
GROUP BY "Age"
) v -- Vehicle
JOIN (
SELECT
"Age",
COUNT(*) AS "Total",
COUNT_IF("Race" IN (SELECT CODE FROM BlackRace)) AS "Black"
FROM DEATH.DEATH.DEATHRECORDS d
JOIN (
SELECT
DISTINCT e."DeathRecordId" AS "id"
FROM DEATH.DEATH.ENTITYAXISCONDITIONS e
JOIN (
SELECT
"Code", "Description"
FROM DEATH.DEATH.ICD10CODE
WHERE "Description" LIKE '%firearm%'
) c
ON e."Icd10Code" = c."Code"
) f
ON d."Id" = f."id"
WHERE "Age" BETWEEN 12 AND 18
GROUP BY "Age"
) g
ON g."Age" = v."Age";
SELECT
TO_DATE(TO_TIMESTAMP_NTZ("block_timestamp" / 1000000)) AS "Date", -- 将时间戳转换为日期格式,除以1000000
TO_CHAR(SUM(
CASE
WHEN "input" LIKE '0x40c10f19%' THEN 1
ELSE -1
END *
CAST(CONCAT('0x', LTRIM(SUBSTRING("input",
CASE
WHEN "input" LIKE '0x40c10f19%' THEN 75
ELSE 11
END, 64), '0')) AS FLOAT) / 1000000)
, '$999,999,999,999') AS "Δ Total Market Value"
FROM
"CRYPTO"."CRYPTO_ETHEREUM"."TRANSACTIONS"
WHERE
TO_DATE(TO_TIMESTAMP_NTZ("block_timestamp" / 1000000)) BETWEEN '2023-01-01' AND '2023-12-31'
AND "to_address" = '0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48' -- USDC Token
AND ("input" LIKE '0x42966c68%' -- Burn
OR "input" LIKE '0x40c10f19%' -- Mint
)
GROUP BY
TO_DATE(TO_TIMESTAMP_NTZ("block_timestamp" / 1000000))
ORDER BY
"Date" DESC;
WITH AA AS (
SELECT
FIRST_VALUE("assignee_harmonized") OVER (PARTITION BY "application_number" ORDER BY "application_number") AS assignee_harmonized,
FIRST_VALUE("filing_date") OVER (PARTITION BY "application_number" ORDER BY "application_number") AS filing_date,
"application_number"
FROM
PATENTS.PATENTS.PUBLICATIONS AS pubs
, LATERAL FLATTEN(input => pubs."cpc") AS c
WHERE
c.value:"code" LIKE 'A61%'
),
PatentApplications AS (
SELECT
ANY_VALUE(assignee_harmonized) as assignee_harmonized,
ANY_VALUE(filing_date) as filing_date
FROM AA
GROUP BY "application_number"
),
AssigneeApplications AS (
SELECT
COUNT(*) AS total_applications,
a.value::STRING AS assignee_name,
CAST(FLOOR(filing_date / 10000) AS INT) AS filing_year
FROM
PatentApplications
, LATERAL FLATTEN(input => assignee_harmonized) AS a
GROUP BY
a.value::STRING, filing_year
),
TotalApplicationsPerAssignee AS (
SELECT
assignee_name,
SUM(total_applications) AS total_applications
FROM
AssigneeApplications
GROUP BY
assignee_name
ORDER BY
total_applications DESC
LIMIT 1
),
MaxYearForTopAssignee AS (
SELECT
aa.assignee_name,
aa.filing_year,
aa.total_applications
FROM
AssigneeApplications aa
INNER JOIN
TotalApplicationsPerAssignee tapa ON aa.assignee_name = tapa.assignee_name
ORDER BY
aa.total_applications DESC
LIMIT 1
)
SELECT filing_year
FROM
MaxYearForTopAssignee
WITH double_entry_book AS (
-- Debits
SELECT
"to_address" AS "address",
"value" AS "value"
FROM
CRYPTO.CRYPTO_ETHEREUM_CLASSIC.TRACES
WHERE
"to_address" IS NOT NULL
AND "status" = 1
AND ("call_type" NOT IN ('delegatecall', 'callcode', 'staticcall') OR "call_type" IS NULL)
AND TO_DATE(TO_TIMESTAMP("block_timestamp" / 1000000)) = '2016-10-14'
UNION ALL
-- Credits
SELECT
"from_address" AS "address",
- "value" AS "value"
FROM
CRYPTO.CRYPTO_ETHEREUM_CLASSIC.TRACES
WHERE
"from_address" IS NOT NULL
AND "status" = 1
AND ("call_type" NOT IN ('delegatecall', 'callcode', 'staticcall') OR "call_type" IS NULL)
AND TO_DATE(TO_TIMESTAMP("block_timestamp" / 1000000)) = '2016-10-14'
UNION ALL
-- Transaction Fees Debits
SELECT
"miner" AS "address",
SUM(CAST("receipt_gas_used" AS NUMERIC) * CAST("gas_price" AS NUMERIC)) AS "value"
FROM
CRYPTO.CRYPTO_ETHEREUM_CLASSIC.TRANSACTIONS AS "transactions"
JOIN
CRYPTO.CRYPTO_ETHEREUM_CLASSIC.BLOCKS AS "blocks"
ON "blocks"."number" = "transactions"."block_number"
WHERE
TO_DATE(TO_TIMESTAMP("block_timestamp" / 1000000)) = '2016-10-14'
GROUP BY
"blocks"."miner"
UNION ALL
-- Transaction Fees Credits
SELECT
"from_address" AS "address",
-(CAST("receipt_gas_used" AS NUMERIC) * CAST("gas_price" AS NUMERIC)) AS "value"
FROM
CRYPTO.CRYPTO_ETHEREUM_CLASSIC.TRANSACTIONS
WHERE
TO_DATE(TO_TIMESTAMP("block_timestamp" / 1000000)) = '2016-10-14'
),
net_changes AS (
SELECT
"address",
SUM("value") AS "net_change"
FROM
double_entry_book
GROUP BY
"address"
)
SELECT
MAX("net_change") AS "max_net_change",
MIN("net_change") AS "min_net_change"
FROM
net_changes;
WITH PatentApplications AS (
SELECT
"assignee_harmonized" AS assignee_harmonized,
"filing_date" AS filing_date,
"country_code" AS country_code,
"application_number" AS application_number
FROM
PATENTS.PATENTS.PUBLICATIONS AS pubs,
LATERAL FLATTEN(input => pubs."cpc") AS c
WHERE c.value:"code" LIKE 'A01B3%'
),
AssigneeApplications AS (
SELECT
COUNT(*) AS year_country_cnt,
a.value:"name" AS assignee_name,
CAST(FLOOR(filing_date / 10000) AS INT) AS filing_year,
apps.country_code as country_code
FROM
PatentApplications as apps,
LATERAL FLATTEN(input => assignee_harmonized) AS a
GROUP BY
assignee_name, filing_year, country_code
),
RankedApplications AS (
SELECT
assignee_name,
filing_year,
country_code,
year_country_cnt,
SUM(year_country_cnt) OVER (PARTITION BY assignee_name, filing_year) AS total_cnt,
ROW_NUMBER() OVER (PARTITION BY assignee_name, filing_year ORDER BY year_country_cnt DESC) AS rn
FROM
AssigneeApplications
),
AggregatedData AS (
SELECT
total_cnt AS year_cnt,
assignee_name,
filing_year,
country_code
FROM
RankedApplications
WHERE
rn = 1
)
SELECT
total_count,
REPLACE(assignee_name, '"', '') AS assignee_name,
year_cnt,
filing_year,
country_code
FROM (
SELECT
year_cnt,
assignee_name,
filing_year,
country_code,
SUM(year_cnt) OVER (PARTITION BY assignee_name) AS total_count,
ROW_NUMBER() OVER (PARTITION BY assignee_name ORDER BY year_cnt DESC) AS rn
FROM
AggregatedData
ORDER BY assignee_name
) sub
WHERE rn = 1
ORDER BY total_count
DESC
LIMIT 3
WITH LatestWeek AS (
SELECT
DATEADD(WEEK, -52, MAX("week")) AS "last_year_week"
FROM
GOOGLE_TRENDS.GOOGLE_TRENDS.TOP_RISING_TERMS
),
LatestRefreshDate AS (
SELECT
MAX("refresh_date") AS "latest_refresh_date"
FROM
GOOGLE_TRENDS.GOOGLE_TRENDS.TOP_RISING_TERMS
),
RankedTerms AS (
SELECT
"term",
"week",
CASE WHEN "score" IS NULL THEN NULL ELSE "dma_name" END AS "dma_name",
"rank",
"score",
ROW_NUMBER() OVER (
PARTITION BY "term", "week"
ORDER BY "score" DESC
) AS rn
FROM
GOOGLE_TRENDS.GOOGLE_TRENDS.TOP_RISING_TERMS
WHERE
"week" = (SELECT "last_year_week" FROM LatestWeek)
AND "refresh_date" = (SELECT "latest_refresh_date" FROM LatestRefreshDate)
)
SELECT
"term"
FROM
RankedTerms
WHERE
rn = 1
ORDER BY
"rank"
LIMIT 1;
WITH sub AS (
SELECT
"users"."id",
CAST(TO_TIMESTAMP(MAX("users"."creation_date") / 1000000.0) AS DATE) AS "user_creation_date", -- 使用 MAX 聚合 creation_date 并转换为 DATE
MAX("users"."reputation") AS "reputation",
SUM(CASE WHEN badges."user_id" IS NULL THEN 0 ELSE 1 END) AS "num_badges"
FROM "STACKOVERFLOW"."STACKOVERFLOW"."USERS" "users"
LEFT JOIN "STACKOVERFLOW"."STACKOVERFLOW"."BADGES" badges
ON "users"."id" = badges."user_id"
WHERE CAST(TO_TIMESTAMP("users"."creation_date" / 1000000.0) AS DATE) <= DATE '2021-10-01'
GROUP BY "users"."id"
)
SELECT
DATEDIFF(YEAR, "user_creation_date", DATE '2021-10-01') AS "user_tenure",
COUNT(1) AS "Num_Users",
AVG("reputation") AS "Avg_Reputation",
AVG("num_badges") AS "Avg_Num_Badges"
FROM sub
GROUP BY "user_tenure"
ORDER BY "user_tenure";
WITH fam AS (
SELECT DISTINCT
"family_id"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."PUBLICATIONS"
),
crossover AS (
SELECT
"publication_number",
"family_id"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."PUBLICATIONS"
),
pub AS (
SELECT
"family_id",
MIN("publication_date") AS "publication_date",
LISTAGG("publication_number", ',') WITHIN GROUP (ORDER BY "publication_number") AS "publication_number",
LISTAGG("country_code", ',') WITHIN GROUP (ORDER BY "country_code") AS "country_code"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."PUBLICATIONS" AS p
GROUP BY
"family_id"
),
tech_class AS (
SELECT
p."family_id",
LISTAGG(DISTINCT cpc.value:"code"::STRING, ',') WITHIN GROUP (ORDER BY cpc.value:"code"::STRING) AS "cpc",
LISTAGG(DISTINCT ipc.value:"code"::STRING, ',') WITHIN GROUP (ORDER BY ipc.value:"code"::STRING) AS "ipc"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."PUBLICATIONS" AS p
CROSS JOIN LATERAL FLATTEN(input => p."cpc") AS cpc
CROSS JOIN LATERAL FLATTEN(input => p."ipc") AS ipc
GROUP BY
p."family_id"
),
cit AS (
SELECT
p."family_id",
LISTAGG(crossover."family_id", ',') WITHIN GROUP (ORDER BY crossover."family_id" ASC) AS "citation"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."PUBLICATIONS" AS p
CROSS JOIN LATERAL FLATTEN(input => p."citation") AS citation
LEFT JOIN
crossover
ON
citation.value:"publication_number"::STRING = crossover."publication_number"
GROUP BY
p."family_id"
),
tmp_gpr AS (
SELECT
"family_id",
LISTAGG(crossover."publication_number", ',') AS "cited_by_publication_number"
FROM
"PATENTS_GOOGLE"."PATENTS_GOOGLE"."ABS_AND_EMB" AS p
CROSS JOIN LATERAL FLATTEN(input => p."cited_by") AS cited_by
LEFT JOIN
crossover
ON
cited_by.value:"publication_number"::STRING = crossover."publication_number"
GROUP BY
"family_id"
),
gpr AS (
SELECT
tmp_gpr."family_id",
LISTAGG(crossover."family_id", ',') WITHIN GROUP (ORDER BY crossover."family_id" ASC) AS "cited_by"
FROM
tmp_gpr
CROSS JOIN LATERAL FLATTEN(input => SPLIT(tmp_gpr."cited_by_publication_number", ',')) AS cited_by_publication_number
LEFT JOIN
crossover
ON
cited_by_publication_number.value::STRING = crossover."publication_number"
GROUP BY
tmp_gpr."family_id"
)
SELECT
fam."family_id",
pub."publication_date",
pub."publication_number",
pub."country_code",
tech_class."cpc",
tech_class."ipc",
cit."citation",
gpr."cited_by"
FROM
fam
LEFT JOIN pub ON fam."family_id" = pub."family_id"
LEFT JOIN tech_class ON fam."family_id" = tech_class."family_id"
LEFT JOIN cit ON fam."family_id" = cit."family_id"
LEFT JOIN gpr ON fam."family_id" = gpr."family_id"
WHERE
pub."publication_date" BETWEEN 20150101 AND 20150131;
SELECT
patent."title",
patent."abstract",
app."date" AS publication_date,
filterData."bkwdCitations",
filterData."fwrdCitations_5"
FROM
"PATENTSVIEW"."PATENTSVIEW"."PATENT" AS patent
JOIN
"PATENTSVIEW"."PATENTSVIEW"."APPLICATION" AS app
ON app."patent_id" = patent."id"
JOIN (
SELECT
DISTINCT cpc."patent_id",
IFNULL(citation_5."bkwdCitations", 0) AS "bkwdCitations",
IFNULL(citation_5."fwrdCitations_5", 0) AS "fwrdCitations_5"
FROM
"PATENTSVIEW"."PATENTSVIEW"."CPC_CURRENT" AS cpc
LEFT JOIN (
SELECT
b."patent_id",
b."bkwdCitations",
f."fwrdCitations_5"
FROM (
SELECT
cited."citation_id" AS "patent_id",
IFNULL(COUNT(*), 0) AS "fwrdCitations_5"
FROM
"PATENTSVIEW"."PATENTSVIEW"."USPATENTCITATION" AS cited
JOIN
"PATENTSVIEW"."PATENTSVIEW"."APPLICATION" AS apps
ON cited."citation_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" >= apps."date"
AND TRY_CAST(cited."date" AS DATE) <= DATEADD(YEAR, 5, TRY_CAST(apps."date" AS DATE)) -- 5-year citation window
GROUP BY
cited."citation_id"
) AS f
JOIN (
SELECT
cited."patent_id",
IFNULL(COUNT(*), 0) AS "bkwdCitations"
FROM
"PATENTSVIEW"."PATENTSVIEW"."USPATENTCITATION" AS cited
JOIN
"PATENTSVIEW"."PATENTSVIEW"."APPLICATION" AS apps
ON cited."patent_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" < apps."date" -- backward citation count
GROUP BY
cited."patent_id"
) AS b
ON b."patent_id" = f."patent_id"
WHERE
b."bkwdCitations" IS NOT NULL
AND f."fwrdCitations_5" IS NOT NULL
) AS citation_5
ON cpc."patent_id" = citation_5."patent_id"
WHERE
cpc."subsection_id" IN ('C05', 'C06', 'C07', 'C08', 'C09', 'C10', 'C11', 'C12', 'C13')
OR cpc."group_id" IN ('A01G', 'A01H', 'A61K', 'A61P', 'A61Q', 'B01F', 'B01J', 'B81B', 'B82B', 'B82Y', 'G01N', 'G16H')
) AS filterData
ON app."patent_id" = filterData."patent_id"
WHERE
TRY_CAST(app."date" AS DATE) < '2014-02-01'
AND TRY_CAST(app."date" AS DATE) >= '2014-01-01';
WITH
cohortExpr AS (
SELECT
"sample_barcode",
LOG(10, "normalized_count") AS "expr"
FROM
"TCGA_HG19_DATA_V0"."TCGA_HG19_DATA_V0"."RNASEQ_GENE_EXPRESSION_UNC_RSEM"
WHERE
"project_short_name" = 'TCGA-BRCA'
AND "HGNC_gene_symbol" = 'TP53'
AND "normalized_count" IS NOT NULL
AND "normalized_count" > 0
),
cohortVar AS (
SELECT
"Variant_Type",
"sample_barcode_tumor" AS "sample_barcode"
FROM
"TCGA_HG19_DATA_V0"."TCGA_HG19_DATA_V0"."SOMATIC_MUTATION_MC3"
WHERE
"SYMBOL" = 'TP53'
),
cohort AS (
SELECT
e."sample_barcode" AS "sample_barcode",
v."Variant_Type" AS "group_name",
e."expr"
FROM
cohortExpr e
JOIN
cohortVar v
ON
e."sample_barcode" = v."sample_barcode"
),
grandMeanTable AS (
SELECT
AVG("expr") AS "grand_mean"
FROM
cohort
),
groupMeansTable AS (
SELECT
AVG("expr") AS "group_mean",
"group_name",
COUNT("sample_barcode") AS "n"
FROM
cohort
GROUP BY
"group_name"
),
ssBetween AS (
SELECT
g."group_name",
g."group_mean",
gm."grand_mean",
g."n",
g."n" * POW(g."group_mean" - gm."grand_mean", 2) AS "n_diff_sq"
FROM
groupMeansTable g
CROSS JOIN
grandMeanTable gm
),
ssWithin AS (
SELECT
c."group_name" AS "group_name",
c."expr",
b."group_mean",
b."n" AS "n",
POW(c."expr" - b."group_mean", 2) AS "s2"
FROM
cohort c
JOIN
ssBetween b
ON
c."group_name" = b."group_name"
),
numerator AS (
SELECT
SUM("n_diff_sq") / (COUNT("group_name") - 1) AS "mean_sq_between"
FROM
ssBetween
),
denominator AS (
SELECT
COUNT(DISTINCT "group_name") AS "k",
COUNT("group_name") AS "n",
SUM("s2") / (COUNT("group_name") - COUNT(DISTINCT "group_name")) AS "mean_sq_within"
FROM
ssWithin
)
SELECT
"n",
"k",
"mean_sq_between",
"mean_sq_within",
"mean_sq_between" / "mean_sq_within" AS "F"
FROM
numerator,
denominator;
WITH
table1 AS (
SELECT
"Symbol" AS "symbol",
AVG(LOG(10, "normalized_count" + 1)) AS "data",
"ParticipantBarcode"
FROM
PANCANCER_ATLAS_1.PANCANCER_ATLAS_FILTERED.EBPP_ADJUSTPANCAN_ILLUMINAHISEQ_RNASEQV2_GENEXP_FILTERED
WHERE
"Study" = 'LGG'
AND "Symbol" = 'IGF2'
AND "normalized_count" IS NOT NULL
GROUP BY
"ParticipantBarcode", "symbol"
),
table2 AS (
SELECT
"symbol",
"avgdata" AS "data",
"ParticipantBarcode"
FROM (
SELECT
'icd_o_3_histology' AS "symbol",
"icd_o_3_histology" AS "avgdata",
"bcr_patient_barcode" AS "ParticipantBarcode"
FROM
PANCANCER_ATLAS_1.PANCANCER_ATLAS_FILTERED.CLINICAL_PANCAN_PATIENT_WITH_FOLLOWUP_FILTERED
WHERE
"acronym" = 'LGG'
AND "icd_o_3_histology" IS NOT NULL
AND NOT REGEXP_LIKE("icd_o_3_histology", '^(\\[.*\\]$)')
)
),
table_data AS (
SELECT
n1."data" AS "data1",
n2."data" AS "data2",
n1."ParticipantBarcode"
FROM
table1 AS n1
INNER JOIN
table2 AS n2
ON
n1."ParticipantBarcode" = n2."ParticipantBarcode"
)
SELECT
"data2" AS "Histology_Type",
AVG("data1") AS "Average_Log_Expression"
FROM
table_data
GROUP BY
"data2";
WITH cohort AS (
SELECT "case_barcode"
FROM "TCGA_HG38_DATA_V0"."TCGA_BIOCLIN_V0"."CLINICAL"
WHERE "project_short_name" = 'TCGA-BRCA'
AND "age_at_diagnosis" <= 80
AND "pathologic_stage" IN ('Stage I', 'Stage II', 'Stage IIA')
),
table1 AS (
SELECT
"symbol",
"data" AS "rnkdata",
"ParticipantBarcode"
FROM (
SELECT
"gene_name" AS "symbol",
AVG(LOG(10, "HTSeq__Counts" + 1)) AS "data",
"case_barcode" AS "ParticipantBarcode"
FROM "TCGA_HG38_DATA_V0"."TCGA_HG38_DATA_V0"."RNASEQ_GENE_EXPRESSION"
WHERE "case_barcode" IN (SELECT "case_barcode" FROM cohort)
AND "gene_name" = 'SNORA31'
AND "HTSeq__Counts" IS NOT NULL
GROUP BY
"ParticipantBarcode", "symbol"
)
),
table2 AS (
SELECT
"symbol",
"data" AS "rnkdata",
"ParticipantBarcode"
FROM (
SELECT
"mirna_id" AS "symbol",
AVG("reads_per_million_miRNA_mapped") AS "data",
"case_barcode" AS "ParticipantBarcode"
FROM "TCGA_HG38_DATA_V0"."TCGA_HG38_DATA_V0"."MIRNASEQ_EXPRESSION"
WHERE "case_barcode" IN (SELECT "case_barcode" FROM cohort)
AND "mirna_id" IS NOT NULL
AND "reads_per_million_miRNA_mapped" IS NOT NULL
GROUP BY
"ParticipantBarcode", "symbol"
)
),
summ_table AS (
SELECT
n1."symbol" AS "symbol1",
n2."symbol" AS "symbol2",
COUNT(n1."ParticipantBarcode") AS "n",
CORR(n1."rnkdata", n2."rnkdata") AS "correlation"
FROM
table1 AS n1
INNER JOIN
table2 AS n2
ON
n1."ParticipantBarcode" = n2."ParticipantBarcode"
GROUP BY
"symbol1", "symbol2"
)
SELECT
"symbol1",
"symbol2",
ABS("correlation") * SQRT(( "n" - 2 ) / (1 - "correlation" * "correlation")) AS "t"
FROM
summ_table
WHERE
"n" > 25
AND ABS("correlation") >= 0.3
AND ABS("correlation") < 1.0;
WITH
table1 AS (
SELECT
"histological_type" AS "data1",
"bcr_patient_barcode" AS "ParticipantBarcode"
FROM
"PANCANCER_ATLAS_1"."PANCANCER_ATLAS_FILTERED"."CLINICAL_PANCAN_PATIENT_WITH_FOLLOWUP_FILTERED"
WHERE
"acronym" = 'BRCA'
AND "histological_type" IS NOT NULL
),
table2 AS (
SELECT
"Hugo_Symbol" AS "symbol",
"ParticipantBarcode"
FROM
"PANCANCER_ATLAS_1"."PANCANCER_ATLAS_FILTERED"."MC3_MAF_V5_ONE_PER_TUMOR_SAMPLE"
WHERE
"Study" = 'BRCA'
AND "Hugo_Symbol" = 'CDH1'
AND "FILTER" = 'PASS'
GROUP BY
"ParticipantBarcode", "symbol"
),
summ_table AS (
SELECT
n1."data1",
CASE
WHEN n2."ParticipantBarcode" IS NULL THEN 'NO'
ELSE 'YES'
END AS "data2",
COUNT(*) AS "Nij"
FROM
table1 AS n1
LEFT JOIN
table2 AS n2
ON n1."ParticipantBarcode" = n2."ParticipantBarcode"
GROUP BY
n1."data1", "data2"
),
percentages AS (
SELECT
"data1",
SUM(CASE WHEN "data2" = 'YES' THEN "Nij" ELSE 0 END) AS "mutation_count",
SUM("Nij") AS "total",
SUM(CASE WHEN "data2" = 'YES' THEN "Nij" ELSE 0 END) / SUM("Nij") AS "mutation_percentage"
FROM
summ_table
GROUP BY
"data1"
)
SELECT
"data1" AS "Histological_Type"
FROM
percentages
ORDER BY
"mutation_percentage" DESC
LIMIT 5;
WITH
table1 AS (
SELECT
"symbol",
"avgdata" AS "data",
"ParticipantBarcode"
FROM (
SELECT
'histological_type' AS "symbol",
"histological_type" AS "avgdata",
"bcr_patient_barcode" AS "ParticipantBarcode"
FROM
"PANCANCER_ATLAS_1"."PANCANCER_ATLAS_FILTERED"."CLINICAL_PANCAN_PATIENT_WITH_FOLLOWUP_FILTERED"
WHERE
"acronym" = 'BRCA'
AND "histological_type" IS NOT NULL
)
),
table2 AS (
SELECT
"symbol",
"ParticipantBarcode"
FROM (
SELECT
"Hugo_Symbol" AS "symbol",
"ParticipantBarcode" AS "ParticipantBarcode"
FROM
"PANCANCER_ATLAS_1"."PANCANCER_ATLAS_FILTERED"."MC3_MAF_V5_ONE_PER_TUMOR_SAMPLE"
WHERE
"Study" = 'BRCA'
AND "Hugo_Symbol" = 'CDH1'
AND "FILTER" = 'PASS'
GROUP BY
"ParticipantBarcode", "symbol"
)
),
summ_table AS (
SELECT
n1."data" AS "data1",
CASE
WHEN n2."ParticipantBarcode" IS NULL THEN 'NO'
ELSE 'YES'
END AS "data2",
COUNT(*) AS "Nij"
FROM
table1 AS n1
LEFT JOIN
table2 AS n2
ON n1."ParticipantBarcode" = n2."ParticipantBarcode"
GROUP BY
n1."data", "data2"
),
expected_table AS (
SELECT
"data1",
"data2"
FROM (
SELECT
"data1",
SUM("Nij") AS "Ni"
FROM
summ_table
GROUP BY
"data1"
) AS Ni_table
CROSS JOIN (
SELECT
"data2",
SUM("Nij") AS "Nj"
FROM
summ_table
GROUP BY
"data2"
) AS Nj_table
WHERE
Ni_table."Ni" > 10
AND Nj_table."Nj" > 10
),
contingency_table AS (
SELECT
T1."data1",
T1."data2",
COALESCE(T2."Nij", 0) AS "Nij",
(SUM(T2."Nij") OVER (PARTITION BY T1."data1")) *
(SUM(T2."Nij") OVER (PARTITION BY T1."data2")) /
SUM(T2."Nij") OVER () AS "E_nij"
FROM
expected_table AS T1
LEFT JOIN
summ_table AS T2
ON
T1."data1" = T2."data1"
AND T1."data2" = T2."data2"
)
SELECT
SUM( ( "Nij" - "E_nij" ) * ( "Nij" - "E_nij" ) / "E_nij" ) AS "Chi2"
FROM
contingency_table;
WITH copy AS (
SELECT
"case_barcode",
"chromosome",
"start_pos",
"end_pos",
MAX("copy_number") AS "copy_number"
FROM
"TCGA_MITELMAN"."TCGA_VERSIONED"."COPY_NUMBER_SEGMENT_ALLELIC_HG38_GDC_R23"
WHERE
"project_short_name" = 'TCGA-KIRC'
GROUP BY
"case_barcode",
"chromosome",
"start_pos",
"end_pos"
),
total_cases AS (
SELECT COUNT(DISTINCT "case_barcode") AS "total"
FROM copy
),
cytob AS (
SELECT
"chromosome",
"cytoband_name",
"hg38_start",
"hg38_stop"
FROM
"TCGA_MITELMAN"."PROD"."CYTOBANDS_HG38"
),
joined AS (
SELECT
cytob."chromosome",
cytob."cytoband_name",
cytob."hg38_start",
cytob."hg38_stop",
copy."case_barcode",
copy."copy_number"
FROM
copy
LEFT JOIN cytob
ON cytob."chromosome" = copy."chromosome"
WHERE
(cytob."hg38_start" >= copy."start_pos" AND copy."end_pos" >= cytob."hg38_start")
OR (copy."start_pos" >= cytob."hg38_start" AND copy."start_pos" <= cytob."hg38_stop")
),
cbands AS (
SELECT
"chromosome",
"cytoband_name",
"hg38_start",
"hg38_stop",
"case_barcode",
MAX("copy_number") AS "copy_number"
FROM
joined
GROUP BY
"chromosome",
"cytoband_name",
"hg38_start",
"hg38_stop",
"case_barcode"
),
aberrations AS (
SELECT
"chromosome",
"cytoband_name",
-- Amplifications: more than two copies for diploid > 4
SUM( CASE WHEN "copy_number" > 3 THEN 1 ELSE 0 END ) AS "total_amp",
-- Gains: at most two extra copies
SUM( CASE WHEN "copy_number" = 3 THEN 1 ELSE 0 END ) AS "total_gain",
-- Homozygous deletions, or complete deletions
SUM( CASE WHEN "copy_number" = 0 THEN 1 ELSE 0 END ) AS "total_homodel",
-- Heterozygous deletions, 1 copy lost
SUM( CASE WHEN "copy_number" = 1 THEN 1 ELSE 0 END ) AS "total_heterodel",
-- Normal for Diploid = 2
SUM( CASE WHEN "copy_number" = 2 THEN 1 ELSE 0 END ) AS "total_normal"
FROM
cbands
GROUP BY
"chromosome",
"cytoband_name"
)
SELECT
aberrations."chromosome",
aberrations."cytoband_name",
total_cases."total",
100 * aberrations."total_amp" / total_cases."total" AS "freq_amp",
100 * aberrations."total_gain" / total_cases."total" AS "freq_gain",
100 * aberrations."total_homodel" / total_cases."total" AS "freq_homodel",
100 * aberrations."total_heterodel" / total_cases."total" AS "freq_heterodel",
100 * aberrations."total_normal" / total_cases."total" AS "freq_normal"
FROM
aberrations,
total_cases
ORDER BY
aberrations."chromosome",
aberrations."cytoband_name";
WITH UserPairUpvotes AS (
SELECT
ToUsers."UserName" AS "ToUserName",
FromUsers."UserName" AS "FromUserName",
COUNT(DISTINCT "ForumMessageVotes"."Id") AS "UpvoteCount"
FROM META_KAGGLE.META_KAGGLE.FORUMMESSAGEVOTES AS "ForumMessageVotes"
INNER JOIN META_KAGGLE.META_KAGGLE.USERS AS FromUsers
ON FromUsers."Id" = "ForumMessageVotes"."FromUserId"
INNER JOIN META_KAGGLE.META_KAGGLE.USERS AS ToUsers
ON ToUsers."Id" = "ForumMessageVotes"."ToUserId"
GROUP BY
ToUsers."UserName",
FromUsers."UserName"
),
TopPairs AS (
SELECT
"ToUserName",
"FromUserName",
"UpvoteCount",
ROW_NUMBER() OVER (ORDER BY "UpvoteCount" DESC) AS "Rank"
FROM UserPairUpvotes
),
ReciprocalUpvotes AS (
SELECT
t."ToUserName",
t."FromUserName",
t."UpvoteCount" AS "UpvotesReceived",
COALESCE(u."UpvoteCount", 0) AS "UpvotesGiven"
FROM TopPairs t
LEFT JOIN UserPairUpvotes u
ON t."ToUserName" = u."FromUserName" AND t."FromUserName" = u."ToUserName"
WHERE t."Rank" = 1
)
SELECT
"ToUserName" AS "UpvotedUserName",
"FromUserName" AS "UpvotingUserName",
"UpvotesReceived" AS "UpvotesReceivedByUpvotedUser",
"UpvotesGiven" AS "UpvotesGivenByUpvotedUser"
FROM ReciprocalUpvotes
ORDER BY "UpvotesReceived" DESC, "UpvotesGiven" DESC;
WITH copy AS (
SELECT
"case_barcode",
"chromosome",
"start_pos",
"end_pos",
MAX("copy_number") AS "copy_number"
FROM
"TCGA_MITELMAN"."TCGA_VERSIONED"."COPY_NUMBER_SEGMENT_ALLELIC_HG38_GDC_R23"
WHERE
"project_short_name" = 'TCGA-LAML'
GROUP BY
"case_barcode",
"chromosome",
"start_pos",
"end_pos"
),
total_cases AS (
SELECT COUNT(DISTINCT "case_barcode") AS "total"
FROM copy
),
cytob AS (
SELECT
"chromosome",
"cytoband_name",
"hg38_start",
"hg38_stop"
FROM
"TCGA_MITELMAN"."PROD"."CYTOBANDS_HG38"
),
joined AS (
SELECT
cytob."chromosome",
cytob."cytoband_name",
cytob."hg38_start",
cytob."hg38_stop",
copy."case_barcode",
(ABS(cytob."hg38_stop" - cytob."hg38_start") + ABS(copy."end_pos" - copy."start_pos")
- ABS(cytob."hg38_stop" - copy."end_pos") - ABS(cytob."hg38_start" - copy."start_pos")) / 2.0 AS "overlap",
copy."copy_number"
FROM
copy
LEFT JOIN
cytob
ON
cytob."chromosome" = copy."chromosome"
WHERE
(cytob."hg38_start" >= copy."start_pos" AND copy."end_pos" >= cytob."hg38_start")
OR (copy."start_pos" >= cytob."hg38_start" AND copy."start_pos" <= cytob."hg38_stop")
),
INFO AS (
SELECT
"chromosome",
"cytoband_name",
"hg38_start",
"hg38_stop",
"case_barcode",
ROUND(SUM("overlap" * "copy_number") / SUM("overlap")) AS "copy_number"
FROM
joined
GROUP BY
"chromosome", "cytoband_name", "hg38_start", "hg38_stop", "case_barcode"
)
SELECT
"case_barcode"
FROM
INFO
WHERE
"chromosome" = 'chr15'
AND "cytoband_name" = '15q11'
ORDER BY
"copy_number" DESC
LIMIT 1;
WITH
event_data AS (
SELECT
"type",
EXTRACT(YEAR FROM TO_TIMESTAMP("created_at" / 1000000)) AS "year",
EXTRACT(QUARTER FROM TO_TIMESTAMP("created_at" / 1000000)) AS "quarter",
REGEXP_REPLACE(
"repo"::variant:"url"::string,
'https:\\/\\/github\\.com\\/|https:\\/\\/api\\.github\\.com\\/repos\\/',
''
) AS "name"
FROM GITHUB_REPOS_DATE.DAY._20230118
),
repo_languages AS (
SELECT
"repo_name" AS "name",
"lang"
FROM (
SELECT
"repo_name",
FIRST_VALUE("language") OVER (
PARTITION BY "repo_name" ORDER BY "bytes" DESC
) AS "lang"
FROM (
SELECT
"repo_name",
"language".value:"name" AS "language",
"language".value:"bytes" AS "bytes"
FROM GITHUB_REPOS_DATE.GITHUB_REPOS.LANGUAGES,
LATERAL FLATTEN(INPUT => "language") AS "language"
)
)
WHERE "lang" IS NOT NULL
GROUP BY "repo_name", "lang"
),
joined_data AS (
SELECT
a."type" AS "type",
b."lang" AS "language",
a."year" AS "year",
a."quarter" AS "quarter"
FROM event_data a
JOIN repo_languages b
ON a."name" = b."name"
),
count_data AS (
SELECT
"language",
"year",
"quarter",
"type",
COUNT(*) AS "count"
FROM joined_data
GROUP BY "type", "language", "year", "quarter"
ORDER BY "year", "quarter", "count" DESC
)
SELECT
REPLACE("language", '"', '') AS "language_name",
"count"
FROM count_data
WHERE "count" >= 5
AND "type" = 'PullRequestEvent';
WITH tokenInfo AS (
SELECT "address"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TOKENS"
WHERE "name" = 'BNB'
),
receivedTx AS (
SELECT "tx"."to_address" AS "addr",
"tokens"."name" AS "name",
SUM(CAST("tx"."value" AS FLOAT) / POWER(10, 18)) AS "amount_received"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TOKEN_TRANSFERS" AS "tx"
JOIN tokenInfo ON "tx"."token_address" = tokenInfo."address"
JOIN "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TOKENS" AS "tokens"
ON "tx"."token_address" = "tokens"."address"
WHERE "tx"."to_address" <> '0x0000000000000000000000000000000000000000'
GROUP BY "tx"."to_address", "tokens"."name"
),
sentTx AS (
SELECT "tx"."from_address" AS "addr",
"tokens"."name" AS "name",
SUM(CAST("tx"."value" AS FLOAT) / POWER(10, 18)) AS "amount_sent"
FROM "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TOKEN_TRANSFERS" AS "tx"
JOIN tokenInfo ON "tx"."token_address" = tokenInfo."address"
JOIN "ETHEREUM_BLOCKCHAIN"."ETHEREUM_BLOCKCHAIN"."TOKENS" AS "tokens"
ON "tx"."token_address" = "tokens"."address"
WHERE "tx"."from_address" <> '0x0000000000000000000000000000000000000000'
GROUP BY "tx"."from_address", "tokens"."name"
),
walletBalances AS (
SELECT r."addr",
COALESCE(SUM(r."amount_received"), 0) - COALESCE(SUM(s."amount_sent"), 0) AS "balance"
FROM receivedTx AS r
LEFT JOIN sentTx AS s
ON r."addr" = s."addr"
GROUP BY r."addr"
)
SELECT
SUM("balance") AS "circulating_supply"
FROM walletBalances;
WITH content_extracted AS (
SELECT
"D"."id" AS "id",
"repo_name",
"path",
SPLIT("content", '\n') AS "lines",
"language_name"
FROM
(
SELECT
"id",
"content"
FROM
"GITHUB_REPOS"."GITHUB_REPOS"."SAMPLE_CONTENTS"
) AS "D"
INNER JOIN
(
SELECT
"id",
"C"."repo_name" AS "repo_name",
"path",
"language_name"
FROM
(
SELECT
"id",
"repo_name",
"path"
FROM
"GITHUB_REPOS"."GITHUB_REPOS"."SAMPLE_FILES"
WHERE
LOWER("path") LIKE '%readme.md'
) AS "C"
INNER JOIN
(
SELECT
"repo_name",
"language_struct".value:"name" AS "language_name"
FROM
(
SELECT
"repo_name",
"language"
FROM
"GITHUB_REPOS"."GITHUB_REPOS"."LANGUAGES"
)
CROSS JOIN
LATERAL FLATTEN(INPUT => "language") AS "language_struct"
) AS "F"
ON
"C"."repo_name" = "F"."repo_name"
) AS "E"
ON
"E"."id" = "D"."id"
),
non_empty_lines AS (
SELECT
"line".value AS "line_",
"language_name"
FROM
content_extracted,
LATERAL FLATTEN(INPUT => "lines") AS "line"
WHERE
TRIM("line".value) != ''
AND NOT STARTSWITH(TRIM("line".value), '#')
AND NOT STARTSWITH(TRIM("line".value), '//')
),
aggregated_languages AS (
SELECT
"line_",
COUNT(*) AS "frequency",
ARRAY_AGG("language_name") AS "languages"
FROM
non_empty_lines
GROUP BY
"line_"
)
SELECT
REGEXP_REPLACE("line_", '^"|"$', '') AS "line",
"frequency",
ARRAY_TO_STRING(ARRAY_SORT("languages"), ', ') AS "languages_sorted"
FROM
aggregated_languages
ORDER BY
"frequency" DESC;
WITH patents_sample AS (
SELECT
t1."publication_number",
t1."application_number"
FROM
PATENTS.PATENTS.PUBLICATIONS t1
WHERE
TO_DATE(
CASE
WHEN t1."grant_date" != 0 THEN TO_CHAR(t1."grant_date")
ELSE NULL
END,
'YYYYMMDD'
) BETWEEN TO_DATE('20100101', 'YYYYMMDD') AND TO_DATE('20101231', 'YYYYMMDD')
),
forward_citation AS (
SELECT
patents_sample."publication_number",
COUNT(DISTINCT t3."citing_application_number") AS "forward_citations"
FROM
patents_sample
LEFT JOIN (
SELECT
x2."publication_number",
TO_DATE(
CASE
WHEN x2."filing_date" != 0 THEN TO_CHAR(x2."filing_date")
ELSE NULL
END,
'YYYYMMDD'
) AS "filing_date"
FROM
PATENTS.PATENTS.PUBLICATIONS x2
WHERE
x2."filing_date" != 0
) t2
ON t2."publication_number" = patents_sample."publication_number"
LEFT JOIN (
SELECT
x3."publication_number" AS "citing_publication_number",
x3."application_number" AS "citing_application_number",
TO_DATE(
CASE
WHEN x3."filing_date" != 0 THEN TO_CHAR(x3."filing_date")
ELSE NULL
END,
'YYYYMMDD'
) AS "joined_filing_date",
cite.value:"publication_number"::STRING AS "cited_publication_number"
FROM
PATENTS.PATENTS.PUBLICATIONS x3,
LATERAL FLATTEN(INPUT => x3."citation") cite
WHERE
x3."filing_date" != 0
) t3
ON patents_sample."publication_number" = t3."cited_publication_number"
AND t3."joined_filing_date" BETWEEN t2."filing_date" AND DATEADD(YEAR, 10, t2."filing_date")
GROUP BY
patents_sample."publication_number"
)
SELECT
COUNT(*)
FROM
forward_citation
WHERE
"forward_citations" = 1;
WITH patents_sample AS (
SELECT
t1."publication_number" AS publication_number,
claim.value:"text" AS claims_text
FROM
PATENTS.PATENTS.PUBLICATIONS t1,
LATERAL FLATTEN(input => t1."claims_localized") AS claim
WHERE
t1."country_code" = 'US'
AND t1."grant_date" BETWEEN 20080101 AND 20181231
AND t1."grant_date" != 0
AND t1."publication_number" LIKE '%B2%'
),
Publication_data AS (
SELECT
publication_number,
COUNT_IF(claims_text NOT LIKE '%claim%') AS nb_indep_claims
FROM
patents_sample
GROUP BY
publication_number
)
SELECT COUNT(nb_indep_claims)
FROM Publication_data
WHERE nb_indep_claims != 0
WITH interim_table as(
SELECT
t1."publication_number",
SUBSTR(ipc_u.value:"code", 0, 4) as ipc4
FROM
PATENTS.PATENTS.PUBLICATIONS t1,
LATERAL FLATTEN(input => t1."ipc") AS ipc_u
WHERE
"country_code" = 'US'
AND "grant_date" between 20220601 AND 20220831
AND "grant_date" != 0
AND "publication_number" LIKE '%B2%'
GROUP BY
t1."publication_number",
ipc4
)
SELECT
ipc4
FROM
interim_table
GROUP BY ipc4
ORDER BY COUNT("publication_number") DESC
LIMIT 1
WITH patents_sample AS (
SELECT
"publication_number",
"application_number"
FROM
PATENTS_GOOGLE.PATENTS_GOOGLE.PUBLICATIONS
WHERE
"publication_number" = 'US-9741766-B2'
),
flattened_t5 AS (
SELECT
t5."publication_number",
f.value AS element_value,
f.index AS pos
FROM
PATENTS_GOOGLE.PATENTS_GOOGLE.ABS_AND_EMB t5,
LATERAL FLATTEN(input => t5."embedding_v1") AS f
),
flattened_t6 AS (
SELECT
t6."publication_number",
f.value AS element_value,
f.index AS pos
FROM
PATENTS_GOOGLE.PATENTS_GOOGLE.ABS_AND_EMB t6,
LATERAL FLATTEN(input => t6."embedding_v1") AS f
),
similarities AS (
SELECT
t1."publication_number" AS base_publication_number,
t4."publication_number" AS similar_publication_number,
SUM(ft5.element_value * ft6.element_value) AS similarity
FROM
(SELECT * FROM patents_sample LIMIT 1) t1
LEFT JOIN (
SELECT
x3."publication_number",
EXTRACT(YEAR, TO_DATE(CAST(x3."filing_date" AS STRING), 'YYYYMMDD')) AS focal_filing_year
FROM
PATENTS_GOOGLE.PATENTS_GOOGLE.PUBLICATIONS x3
WHERE
x3."filing_date" != 0
) t3 ON t3."publication_number" = t1."publication_number"
LEFT JOIN (
SELECT
x4."publication_number",
EXTRACT(YEAR, TO_DATE(CAST(x4."filing_date" AS STRING), 'YYYYMMDD')) AS filing_year
FROM
PATENTS_GOOGLE.PATENTS_GOOGLE.PUBLICATIONS x4
WHERE
x4."filing_date" != 0
) t4 ON
t4."publication_number" != t1."publication_number"
AND t3.focal_filing_year = t4.filing_year
LEFT JOIN flattened_t5 AS ft5 ON ft5."publication_number" = t1."publication_number"
LEFT JOIN flattened_t6 AS ft6 ON ft6."publication_number" = t4."publication_number"
AND ft5.pos = ft6.pos -- Align vector positions
GROUP BY
t1."publication_number", t4."publication_number"
)
SELECT
s.similar_publication_number,
s.similarity
FROM (
SELECT
s.*,
ROW_NUMBER() OVER (PARTITION BY s.base_publication_number ORDER BY s.similarity DESC) AS seqnum
FROM
similarities s
) s
WHERE
seqnum <= 5;
WITH
MonthlyTotals AS
(
SELECT
TO_CHAR("date", 'YYYY-MM') AS "month",
SUM("volume_sold_gallons") AS "total_monthly_volume"
FROM
IOWA_LIQUOR_SALES.IOWA_LIQUOR_SALES."SALES"
WHERE
"date" >= '2022-01-01'
AND TO_CHAR("date", 'YYYY-MM') < TO_CHAR(CURRENT_DATE(), 'YYYY-MM')
GROUP BY
TO_CHAR("date", 'YYYY-MM')
),
MonthCategory AS
(
SELECT
TO_CHAR("date", 'YYYY-MM') AS "month",
"category",
"category_name",
SUM("volume_sold_gallons") AS "category_monthly_volume",
CASE
WHEN "total_monthly_volume" != 0 THEN (SUM("volume_sold_gallons") / "total_monthly_volume") * 100
ELSE NULL
END AS "category_pct_of_month_volume"
FROM
IOWA_LIQUOR_SALES.IOWA_LIQUOR_SALES."SALES" AS Sales
LEFT JOIN
MonthlyTotals ON TO_CHAR(Sales."date", 'YYYY-MM') = MonthlyTotals."month"
WHERE
Sales."date" >= '2022-01-01'
AND TO_CHAR(Sales."date", 'YYYY-MM') < TO_CHAR(CURRENT_DATE(), 'YYYY-MM')
GROUP BY
TO_CHAR(Sales."date", 'YYYY-MM'), "category", "category_name", "total_monthly_volume"
),
middle_info AS
(
SELECT
Category1."category" AS "category1",
Category1."category_name" AS "category_name1",
Category2."category" AS "category2",
Category2."category_name" AS "category_name2",
COUNT(DISTINCT Category1."month") AS "num_months",
CORR(Category1."category_pct_of_month_volume", Category2."category_pct_of_month_volume") AS "category_corr_across_months",
AVG(Category1."category_pct_of_month_volume") AS "category1_avg_pct_of_month_volume",
AVG(Category2."category_pct_of_month_volume") AS "category2_avg_pct_of_month_volume"
FROM
MonthCategory Category1
INNER JOIN
MonthCategory Category2
ON Category1."month" = Category2."month"
GROUP BY
Category1."category", Category1."category_name", Category2."category", Category2."category_name"
HAVING
"num_months" >= 24
AND "category1_avg_pct_of_month_volume" >= 1
AND "category2_avg_pct_of_month_volume" >= 1
)
SELECT
"category_name1",
"category_name2"
FROM
middle_info
ORDER BY
"category_corr_across_months"
LIMIT 1;
WITH patent_cpcs AS (
SELECT
cd."parents",
CAST(FLOOR("filing_date" / 10000) AS INT) AS "filing_year"
FROM (
SELECT
MAX("cpc") AS "cpc", MAX("filing_date") AS "filing_date"
FROM
PATENTS.PATENTS.PUBLICATIONS
WHERE
"application_number" != ''
GROUP BY
"application_number"
) AS publications
, LATERAL FLATTEN(INPUT => "cpc") AS cpcs
JOIN
PATENTS.PATENTS.CPC_DEFINITION cd ON cd."symbol" = cpcs.value:"code"
WHERE
cpcs.value:"first" = TRUE
AND "filing_date" > 0
),
yearly_counts AS (
SELECT
"cpc_group",
"filing_year",
COUNT(*) AS "cnt"
FROM (
SELECT
cpc_parent.value::STRING AS "cpc_group",
"filing_year"
FROM patent_cpcs,
LATERAL FLATTEN(input => patent_cpcs."parents") AS cpc_parent
)
GROUP BY "cpc_group", "filing_year"
),
ordered_counts AS (
SELECT
"cpc_group",
"filing_year",
"cnt",
ROW_NUMBER() OVER (PARTITION BY "cpc_group" ORDER BY "filing_year" ASC) AS rn
FROM yearly_counts
),
recursive_ema AS (
-- Anchor member: first year per cpc_group
SELECT
"cpc_group",
"filing_year",
"cnt",
"cnt" * 0.2 + 0 * 0.8 AS "ema",
rn
FROM ordered_counts
WHERE rn = 1
UNION ALL
-- Recursive member: subsequent years
SELECT
oc."cpc_group",
oc."filing_year",
oc."cnt",
oc."cnt" * 0.2 + re."ema" * 0.8 AS "ema",
oc.rn
FROM ordered_counts oc
JOIN recursive_ema re
ON oc."cpc_group" = re."cpc_group"
AND oc.rn = re.rn + 1
),
max_ema AS (
SELECT
"cpc_group",
"filing_year",
"ema"
FROM recursive_ema
),
ranked_ema AS (
SELECT
me."cpc_group",
me."filing_year",
me."ema",
ROW_NUMBER() OVER (
PARTITION BY me."cpc_group"
ORDER BY me."ema" DESC, me."filing_year" DESC
) AS rn_rank
FROM max_ema me
)
SELECT
c."titleFull",
REPLACE(r."cpc_group", '"', '') AS "cpc_group",
r."filing_year" AS "best_filing_year"
FROM ranked_ema r
JOIN "PATENTS"."PATENTS"."CPC_DEFINITION" c
ON r."cpc_group" = c."symbol"
WHERE
c."level" = 5
AND r.rn_rank = 1
ORDER BY
c."titleFull",
"cpc_group" ASC;
WITH patent_cpcs AS (
SELECT
cd."parents",
CAST(FLOOR("filing_date" / 10000) AS INT) AS "filing_year"
FROM (
SELECT MAX("cpc") AS "cpc", MAX("filing_date") AS "filing_date"
FROM "PATENTS"."PATENTS"."PUBLICATIONS"
WHERE "application_number" != ''
AND "country_code" = 'DE'
AND "grant_date" >= 20161201
AND "grant_date" <= 20161231
GROUP BY "application_number"
), LATERAL FLATTEN(INPUT => "cpc") AS cpcs
JOIN "PATENTS"."PATENTS"."CPC_DEFINITION" cd ON cd."symbol" = cpcs.value:"code"
WHERE cpcs.value:"first" = TRUE
AND "filing_date" > 0
),
yearly_counts AS (
SELECT
"cpc_group",
"filing_year",
COUNT(*) AS "cnt"
FROM (
SELECT
cpc_parent.VALUE AS "cpc_group", -- Corrected reference to flattened "parents"
"filing_year"
FROM patent_cpcs,
LATERAL FLATTEN(INPUT => "parents") AS cpc_parent -- Corrected reference to flattened "parents"
)
GROUP BY "cpc_group", "filing_year"
),
moving_avg AS (
SELECT
"cpc_group",
"filing_year",
"cnt",
AVG("cnt") OVER (PARTITION BY "cpc_group" ORDER BY "filing_year" ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS "moving_avg"
FROM yearly_counts
)
SELECT
c."titleFull", -- Ensure correct column name (check case)
REPLACE("cpc_group", '"', '') AS "cpc_group",
MAX("filing_year") AS "best_filing_year"
FROM moving_avg
JOIN "PATENTS"."PATENTS"."CPC_DEFINITION" c ON "cpc_group" = c."symbol"
WHERE c."level" = 4
GROUP BY c."titleFull", "cpc_group"
ORDER BY c."titleFull", "cpc_group" ASC;
SELECT
REPLACE(citing_assignee, '"', '') AS citing_assignee,
cpcdef."titleFull" AS cpc_title,
COUNT(*) AS number
FROM (
SELECT
pubs."publication_number" AS citing_publication_number,
cite.value:"publication_number" AS cited_publication_number,
citing_assignee_s.value:"name" AS citing_assignee,
SUBSTR(cpcs.value:"code", 1, 4) AS citing_cpc_subclass
FROM
PATENTS.PATENTS.PUBLICATIONS AS pubs
, LATERAL FLATTEN(input => pubs."citation") AS cite
, LATERAL FLATTEN(input => pubs."assignee_harmonized") AS citing_assignee_s
, LATERAL FLATTEN(input => pubs."cpc") AS cpcs
WHERE
cpcs.value:"first" = TRUE
) AS pubs
JOIN (
SELECT
"publication_number" AS cited_publication_number,
cited_assignee_s.value:"name" AS cited_assignee
FROM
PATENTS.PATENTS.PUBLICATIONS
, LATERAL FLATTEN(input => "assignee_harmonized") AS cited_assignee_s
) AS refs
ON pubs.cited_publication_number = refs.cited_publication_number
JOIN
PATENTS.PATENTS.CPC_DEFINITION AS cpcdef
ON cpcdef."symbol" = pubs.citing_cpc_subclass
WHERE
refs.cited_assignee = 'DENSO CORP'
AND pubs.citing_assignee != 'DENSO CORP'
GROUP BY
citing_assignee, cpcdef."titleFull"
WITH allowed_repos AS (
SELECT
"repo_name",
"license"
FROM
GITHUB_REPOS_DATE.GITHUB_REPOS.LICENSES
WHERE
"license" IN (
'gpl-3.0', 'artistic-2.0', 'isc', 'cc0-1.0', 'epl-1.0', 'gpl-2.0',
'mpl-2.0', 'lgpl-2.1', 'bsd-2-clause', 'apache-2.0', 'mit', 'lgpl-3.0'
)
),
watch_counts AS (
SELECT
TRY_PARSE_JSON("repo"):"name"::STRING AS "repo",
COUNT(DISTINCT TRY_PARSE_JSON("actor"):"login"::STRING) AS "watches"
FROM
GITHUB_REPOS_DATE.MONTH._202204
WHERE
"type" = 'WatchEvent'
GROUP BY
TRY_PARSE_JSON("repo"):"name"
),
issue_counts AS (
SELECT
TRY_PARSE_JSON("repo"):"name"::STRING AS "repo",
COUNT(*) AS "issue_events"
FROM
GITHUB_REPOS_DATE.MONTH._202204
WHERE
"type" = 'IssuesEvent'
GROUP BY
TRY_PARSE_JSON("repo"):"name"
),
fork_counts AS (
SELECT
TRY_PARSE_JSON("repo"):"name"::STRING AS "repo",
COUNT(*) AS "forks"
FROM
GITHUB_REPOS_DATE.MONTH._202204
WHERE
"type" = 'ForkEvent'
GROUP BY
TRY_PARSE_JSON("repo"):"name"
)
SELECT
ar."repo_name"
FROM
allowed_repos AS ar
INNER JOIN
fork_counts AS fc ON ar."repo_name" = fc."repo"
INNER JOIN
issue_counts AS ic ON ar."repo_name" = ic."repo"
INNER JOIN
watch_counts AS wc ON ar."repo_name" = wc."repo"
ORDER BY
(fc."forks" + ic."issue_events" + wc."watches") DESC
LIMIT 1;
WITH extracted_modules AS (
SELECT
el."file_id" AS "file_id",
el."repo_name",
el."path" AS "path_",
REPLACE(line.value, '"', '') AS "line_",
CASE
WHEN ENDSWITH(el."path", '.py') THEN 'python'
WHEN ENDSWITH(el."path", '.r') THEN 'r'
ELSE NULL
END AS "language",
CASE
WHEN ENDSWITH(el."path", '.py') THEN
ARRAY_CAT(
ARRAY_CONSTRUCT(REGEXP_SUBSTR(line.value, '\\bimport\\s+(\\w+)', 1, 1, 'e')),
ARRAY_CONSTRUCT(REGEXP_SUBSTR(line.value, '\\bfrom\\s+(\\w+)', 1, 1, 'e'))
)
WHEN ENDSWITH(el."path", '.r') THEN
ARRAY_CONSTRUCT(REGEXP_SUBSTR(line.value, 'library\\s*\\(\\s*([^\\s)]+)\\s*\\)', 1, 1, 'e'))
ELSE ARRAY_CONSTRUCT()
END AS "modules"
FROM (
SELECT
ct."id" AS "file_id",
fl."repo_name" AS "repo_name",
fl."path",
SPLIT(REPLACE(ct."content", '\n', ' \n'), '\n') AS "lines"
FROM
GITHUB_REPOS_DATE.GITHUB_REPOS.SAMPLE_FILES AS fl
JOIN
GITHUB_REPOS_DATE.GITHUB_REPOS.SAMPLE_CONTENTS AS ct
ON fl."id" = ct."id"
) AS el,
LATERAL FLATTEN(input => el."lines") AS line
WHERE
(
ENDSWITH("path_", '.py')
AND
(
"line_" LIKE 'import %'
OR
"line_" LIKE 'from %'
)
)
OR
(
ENDSWITH("path_", '.r')
AND
"line_" LIKE 'library%('
)
),
module_counts AS (
SELECT
em."language",
f.value::STRING AS "module",
COUNT(*) AS "occurrence_count"
FROM
extracted_modules AS em,
LATERAL FLATTEN(input => em."modules") AS f
WHERE
em."modules" IS NOT NULL
AND f.value IS NOT NULL
GROUP BY
em."language",
f.value
),
python AS (
SELECT
"language",
"module",
"occurrence_count"
FROM
module_counts
WHERE
"language" = 'python'
),
rlanguage AS (
SELECT
"language",
"module",
"occurrence_count"
FROM
module_counts AS mc_inner
WHERE
"language" = 'r'
)
SELECT
*
FROM
python
UNION ALL
SELECT
*
FROM
rlanguage
ORDER BY
"language",
"occurrence_count" DESC;
SELECT
CONCAT("city", ', ', "state_name") AS "city",
"zip_code",
COUNT("event_id") AS "count_storms"
FROM (
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2014
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2015
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2016
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2017
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2018
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2019
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2020
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2021
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2022
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2023
UNION ALL
SELECT *
FROM NOAA_DATA_PLUS.NOAA_HISTORIC_SEVERE_STORMS.STORMS_2024
) AS storms
JOIN NOAA_DATA_PLUS.GEO_US_BOUNDARIES.ZIP_CODES
ON ST_WITHIN(ST_GEOGFROMWKB(storms."event_point"), ST_GEOGFROMWKB("zip_code_geom"))
WHERE
LOWER(storms."event_type") = 'hail'
GROUP BY
"zip_code",
"city",
"state_name"
ORDER BY
"count_storms" DESC
LIMIT 5;
SELECT filterData."fwrdCitations_3"
FROM
PATENTSVIEW.PATENTSVIEW.APPLICATION AS app
JOIN (
SELECT DISTINCT
cpc."patent_id",
IFNULL(citation_3."bkwdCitations_3", 0) AS "bkwdCitations_3",
IFNULL(citation_3."fwrdCitations_3", 0) AS "fwrdCitations_3"
FROM
PATENTSVIEW.PATENTSVIEW.CPC_CURRENT AS cpc
LEFT JOIN (
SELECT
b."patent_id",
b."bkwdCitations_3",
f."fwrdCitations_3"
FROM
(SELECT
cited."patent_id",
COUNT(*) AS "fwrdCitations_3"
FROM
PATENTSVIEW.PATENTSVIEW.USPATENTCITATION AS cited
JOIN
PATENTSVIEW.PATENTSVIEW.APPLICATION AS apps
ON cited."patent_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" >= apps."date"
AND TRY_CAST(cited."date" AS DATE) <= DATEADD(YEAR, 1, TRY_CAST(apps."date" AS DATE)) -- Citation within 1 year
GROUP BY
cited."patent_id"
) AS f
JOIN (
SELECT
cited."patent_id",
COUNT(*) AS "bkwdCitations_3"
FROM
PATENTSVIEW.PATENTSVIEW.USPATENTCITATION AS cited
JOIN
PATENTSVIEW.PATENTSVIEW.APPLICATION AS apps
ON cited."patent_id" = apps."patent_id"
WHERE
apps."country" = 'US'
AND cited."date" < apps."date"
AND TRY_CAST(cited."date" AS DATE) >= DATEADD(YEAR, -1, TRY_CAST(apps."date" AS DATE)) -- Citation within 1 year before
GROUP BY
cited."patent_id"
) AS b
ON b."patent_id" = f."patent_id"
WHERE
b."bkwdCitations_3" IS NOT NULL
AND f."fwrdCitations_3" IS NOT NULL
) AS citation_3
ON cpc."patent_id" = citation_3."patent_id"
) AS filterData
ON app."patent_id" = filterData."patent_id"
ORDER BY filterData."bkwdCitations_3" DESC
LIMIT 1;
WITH requests AS (
SELECT
D."id",
D."content",
E."repo_name",
E."path"
FROM
(
SELECT
"id",
"content"
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_CONTENTS
GROUP BY
"id", "content"
) AS D
INNER JOIN
(
SELECT
C."id",
C."repo_name",
C."path"
FROM
(
SELECT
"id",
"repo_name",
"path"
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_FILES
WHERE
LOWER("path") LIKE '%readme.md'
GROUP BY
"path", "id", "repo_name"
) AS C
INNER JOIN
(
SELECT
"repo_name",
language_struct.value:"name"::STRING AS "language_name"
FROM
GITHUB_REPOS.GITHUB_REPOS.LANGUAGES,
LATERAL FLATTEN(input => "language") AS language_struct
WHERE
LOWER(language_struct.value:"name"::STRING) NOT LIKE '%python%'
GROUP BY
"language_name", "repo_name"
) AS F
ON
C."repo_name" = F."repo_name"
) AS E
ON
D."id" = E."id"
)
SELECT
(SELECT COUNT(*) FROM requests WHERE "content" LIKE '%Copyright (c)%') / COUNT(*) AS "proportion"
FROM
requests;
WITH country_name AS (
SELECT 'Singapore' AS value
),
last_updated AS (
SELECT
MAX("last_updated") AS value
FROM GEO_OPENSTREETMAP_WORLDPOP.WORLDPOP.POPULATION_GRID_1KM AS pop
INNER JOIN country_name ON (pop."country_name" = country_name.value)
WHERE "last_updated" < '2023-01-01'
),
aggregated_population AS (
SELECT
"geo_id",
SUM("population") AS sum_population,
ST_POINT("longitude_centroid", "latitude_centroid") AS centr -- 计算每个 geo_id 的中心点
FROM
GEO_OPENSTREETMAP_WORLDPOP.WORLDPOP.POPULATION_GRID_1KM AS pop
INNER JOIN country_name ON (pop."country_name" = country_name.value)
INNER JOIN last_updated ON (pop."last_updated" = last_updated.value)
GROUP BY "geo_id", "longitude_centroid", "latitude_centroid"
),
population AS (
SELECT
SUM(sum_population) AS sum_population,
ST_ENVELOPE(ST_UNION_AGG(centr)) AS boundingbox -- 使用 ST_ENVELOPE 来代替 ST_CONVEXHULL
FROM aggregated_population
),
hospitals AS (
SELECT
layer."geometry"
FROM
GEO_OPENSTREETMAP_WORLDPOP.GEO_OPENSTREETMAP.PLANET_LAYERS AS layer
INNER JOIN population ON ST_INTERSECTS(population.boundingbox, ST_GEOGFROMWKB(layer."geometry"))
WHERE
layer."layer_code" IN (2110, 2120)
),
distances AS (
SELECT
pop."geo_id",
pop."population",
MIN(ST_DISTANCE(ST_GEOGFROMWKB(pop."geog"), ST_GEOGFROMWKB(hospitals."geometry"))) AS distance
FROM
GEO_OPENSTREETMAP_WORLDPOP.WORLDPOP.POPULATION_GRID_1KM AS pop
INNER JOIN country_name ON pop."country_name" = country_name.value
INNER JOIN last_updated ON pop."last_updated" = last_updated.value
CROSS JOIN hospitals
WHERE pop."population" > 0
GROUP BY "geo_id", "population"
)
SELECT
SUM(pd."population") AS population
FROM
distances pd
CROSS JOIN population p
GROUP BY distance
ORDER BY distance DESC
LIMIT 1;
WITH selected_repos AS (
SELECT
f."id",
f."repo_name" AS "repo_name",
f."path" AS "path"
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_FILES AS f
),
deduped_files AS (
SELECT
f."id",
MIN(f."repo_name") AS "repo_name",
MIN(f."path") AS "path"
FROM
selected_repos AS f
GROUP BY
f."id"
)
SELECT
f."repo_name"
FROM
deduped_files AS f
JOIN GITHUB_REPOS.GITHUB_REPOS.SAMPLE_CONTENTS AS c
ON f."id" = c."id"
WHERE
NOT c."binary"
AND f."path" LIKE '%.swift'
ORDER BY c."copies" DESC
LIMIT 1;
WITH bounding_area AS (
SELECT "geometry" AS geometry
FROM GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_FEATURES,
LATERAL FLATTEN(INPUT => "all_tags") AS tag
WHERE "feature_type" = 'multipolygons'
AND tag.value:"key" = 'wikidata'
AND tag.value:"value" = 'Q191'
),
bounding_area_features AS (
SELECT
planet_features."osm_id",
planet_features."feature_type",
planet_features."geometry",
planet_features."all_tags"
FROM GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_FEATURES AS planet_features,
bounding_area
WHERE ST_DWITHIN(
ST_GEOGFROMWKB(planet_features."geometry"),
ST_GEOGFROMWKB(bounding_area.geometry),
0.0
)
),
osm_id_with_wikidata AS (
SELECT DISTINCT
baf."osm_id"
FROM bounding_area_features AS baf,
LATERAL FLATTEN(INPUT => baf."all_tags") AS tag
WHERE tag.value:"key" = 'wikidata'
),
polygons_wo_wikidata AS (
SELECT
baf."osm_id",
tag.value:"value" as name,
baf."geometry" as geometry
FROM bounding_area_features AS baf
LEFT JOIN osm_id_with_wikidata AS wd
ON baf."osm_id" = wd."osm_id",
LATERAL FLATTEN(INPUT => "all_tags") AS tag
WHERE wd."osm_id" IS NULL
AND baf."osm_id" IS NOT NULL
AND baf."feature_type" = 'multipolygons'
AND tag.value:"key" = 'name'
)
SELECT
TRIM(pww.name) as name
FROM bounding_area_features AS baf
JOIN polygons_wo_wikidata AS pww
ON ST_DWITHIN(
ST_GEOGFROMWKB(baf."geometry"),
ST_GEOGFROMWKB(pww.geometry),
0.0
)
LEFT JOIN osm_id_with_wikidata AS wd
ON baf."osm_id" = wd."osm_id"
WHERE wd."osm_id" IS NOT NULL
AND baf."feature_type" = 'points'
GROUP BY pww.name
ORDER BY COUNT(baf."osm_id") DESC
LIMIT 2
SELECT
COUNT(commits_table."message") AS "num_messages"
FROM (
SELECT
L."repo_name",
language_struct.value:"name"::STRING AS "language_name"
FROM
GITHUB_REPOS.GITHUB_REPOS.LANGUAGES AS L,
LATERAL FLATTEN(input => L."language") AS language_struct
) AS lang_table
JOIN
GITHUB_REPOS.GITHUB_REPOS.LICENSES AS license_table
ON
license_table."repo_name" = lang_table."repo_name"
JOIN (
SELECT
*
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_COMMITS
) AS commits_table
ON
commits_table."repo_name" = lang_table."repo_name"
WHERE
license_table."license" LIKE 'apache-2.0'
AND lang_table."language_name" LIKE 'Shell'
AND LENGTH(commits_table."message") > 5
AND LENGTH(commits_table."message") < 10000
AND LOWER(commits_table."message") NOT LIKE 'update%'
AND LOWER(commits_table."message") NOT LIKE 'test%'
AND LOWER(commits_table."message") NOT LIKE 'merge%';
WITH filtered_users AS (
SELECT
"first_name",
"last_name",
"gender",
"age",
CAST(TO_TIMESTAMP("created_at" / 1000000.0) AS DATE) AS "created_at"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS"
WHERE
CAST(TO_TIMESTAMP("created_at" / 1000000.0) AS DATE) BETWEEN '2019-01-01' AND '2022-04-30'
),
youngest_ages AS (
SELECT
"gender",
MIN("age") AS "age"
FROM
filtered_users
GROUP BY
"gender"
),
oldest_ages AS (
SELECT
"gender",
MAX("age") AS "age"
FROM
filtered_users
GROUP BY
"gender"
),
youngest_oldest AS (
SELECT
u."first_name",
u."last_name",
u."gender",
u."age",
'youngest' AS "tag"
FROM
filtered_users u
JOIN
youngest_ages y
ON
u."gender" = y."gender" AND u."age" = y."age"
UNION ALL
SELECT
u."first_name",
u."last_name",
u."gender",
u."age",
'oldest' AS "tag"
FROM
filtered_users u
JOIN
oldest_ages o
ON
u."gender" = o."gender" AND u."age" = o."age"
)
SELECT
"tag",
"gender",
COUNT(*) AS "num"
FROM
youngest_oldest
GROUP BY
"tag", "gender"
ORDER BY
"tag", "gender";
WITH d AS (
SELECT
a."order_id",
TO_CHAR(TO_TIMESTAMP(a."created_at" / 1000000.0), 'YYYY-MM') AS "month", -- 格式化为年月
TO_CHAR(TO_TIMESTAMP(a."created_at" / 1000000.0), 'YYYY') AS "year", -- 格式化为年份
b."product_id", b."sale_price", c."category", c."cost"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDERS" AS a
JOIN
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDER_ITEMS" AS b
ON a."order_id" = b."order_id"
JOIN
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."PRODUCTS" AS c
ON b."product_id" = c."id"
WHERE
a."status" = 'Complete'
AND TO_TIMESTAMP(a."created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2023-01-01') AND TO_TIMESTAMP('2023-12-31')
AND c."category" = 'Sleep & Lounge'
),
e AS (
SELECT
"month",
"year",
"sale_price",
"category",
"cost",
SUM("sale_price") OVER (PARTITION BY "month", "category") AS "TPV",
SUM("cost") OVER (PARTITION BY "month", "category") AS "total_cost",
COUNT(DISTINCT "order_id") OVER (PARTITION BY "month", "category") AS "TPO",
SUM("sale_price" - "cost") OVER (PARTITION BY "month", "category") AS "total_profit",
SUM(("sale_price" - "cost") / "cost") OVER (PARTITION BY "month", "category") AS "Profit_to_cost_ratio"
FROM
d
)
SELECT DISTINCT
"month",
"category",
"TPV",
"total_cost",
"TPO",
"total_profit",
"Profit_to_cost_ratio"
FROM
e
ORDER BY
"month";
WITH youngest AS (
SELECT
"gender",
"id",
"first_name",
"last_name",
"age",
'youngest' AS "tag"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS"
WHERE
"age" = (SELECT MIN("age") FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS")
AND TO_TIMESTAMP("created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2019-01-01') AND TO_TIMESTAMP('2022-04-30')
GROUP BY
"gender", "id", "first_name", "last_name", "age"
ORDER BY
"gender"
),
oldest AS (
SELECT
"gender",
"id",
"first_name",
"last_name",
"age",
'oldest' AS "tag"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS"
WHERE
"age" = (SELECT MAX("age") FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS")
AND TO_TIMESTAMP("created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2019-01-01') AND TO_TIMESTAMP('2022-04-30')
GROUP BY
"gender", "id", "first_name", "last_name", "age"
ORDER BY
"gender"
),
TEMP_record AS (
SELECT * FROM youngest
UNION ALL
SELECT * FROM oldest
)
SELECT
SUM(CASE WHEN "age" = (SELECT MAX("age") FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS") THEN 1 END) -
SUM(CASE WHEN "age" = (SELECT MIN("age") FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS") THEN 1 END) AS "diff"
FROM
TEMP_record;
WITH
main AS (
SELECT
"id" AS "user_id",
"email",
"gender",
"country",
"traffic_source"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS"
WHERE
TO_TIMESTAMP("created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2019-01-01') AND TO_TIMESTAMP('2019-12-31')
),
daate AS (
SELECT
"user_id",
"order_id",
CAST(TO_TIMESTAMP("created_at" / 1000000.0) AS DATE) AS "order_date",
"num_of_item"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDERS"
WHERE
TO_TIMESTAMP("created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2019-01-01') AND TO_TIMESTAMP('2019-12-31')
),
orders AS (
SELECT
"user_id",
"order_id",
"product_id",
"sale_price",
"status"
FROM
"THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDER_ITEMS"
WHERE
TO_TIMESTAMP("created_at" / 1000000.0) BETWEEN TO_TIMESTAMP('2019-01-01') AND TO_TIMESTAMP('2019-12-31')
),
nest AS (
SELECT
o."user_id",
o."order_id",
o."product_id",
d."order_date",
d."num_of_item",
ROUND(o."sale_price", 2) AS "sale_price",
ROUND(d."num_of_item" * o."sale_price", 2) AS "total_sale"
FROM
orders o
INNER JOIN
daate d
ON
o."order_id" = d."order_id"
ORDER BY
o."user_id"
),
type AS (
SELECT
"user_id",
MIN(nest."order_date") AS "cohort_date",
MAX(nest."order_date") AS "latest_shopping_date",
DATEDIFF(MONTH, MIN(nest."order_date"), MAX(nest."order_date")) AS "lifespan_months",
ROUND(SUM("total_sale"), 2) AS "ltv",
COUNT("order_id") AS "no_of_order"
FROM
nest
GROUP BY
"user_id"
),
kite AS (
SELECT
m."user_id",
m."email",
m."gender",
m."country",
m."traffic_source",
EXTRACT(YEAR FROM n."cohort_date") AS "cohort_year",
n."latest_shopping_date",
n."lifespan_months",
n."ltv",
n."no_of_order",
ROUND(n."ltv" / n."no_of_order", 2) AS "avg_order_value"
FROM
main m
INNER JOIN
type n
ON
m."user_id" = n."user_id"
)
SELECT
"email"
FROM
kite
ORDER BY
"avg_order_value" DESC
LIMIT 10;
WITH
orders_x_order_items AS (
SELECT orders.*,
order_items."inventory_item_id",
order_items."sale_price"
FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDERS" AS orders
LEFT JOIN "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDER_ITEMS" AS order_items
ON orders."order_id" = order_items."order_id"
WHERE TO_TIMESTAMP_NTZ(orders."created_at" / 1000000) BETWEEN TO_TIMESTAMP_NTZ('2021-01-01') AND TO_TIMESTAMP_NTZ('2021-12-31')
),
orders_x_inventory AS (
SELECT orders_x_order_items.*,
inventory_items."product_category",
inventory_items."product_department",
inventory_items."product_retail_price",
inventory_items."product_distribution_center_id",
inventory_items."cost",
distribution_centers."name"
FROM orders_x_order_items
LEFT JOIN "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."INVENTORY_ITEMS" AS inventory_items
ON orders_x_order_items."inventory_item_id" = inventory_items."id"
LEFT JOIN "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."DISTRIBUTION_CENTERS" AS distribution_centers
ON inventory_items."product_distribution_center_id" = distribution_centers."id"
WHERE TO_TIMESTAMP_NTZ(inventory_items."created_at" / 1000000) BETWEEN TO_TIMESTAMP_NTZ('2021-01-01') AND TO_TIMESTAMP_NTZ('2021-12-31')
),
orders_x_users AS (
SELECT orders_x_inventory.*,
users."country" AS "users_country"
FROM orders_x_inventory
LEFT JOIN "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS" AS users
ON orders_x_inventory."user_id" = users."id"
WHERE TO_TIMESTAMP_NTZ(users."created_at" / 1000000) BETWEEN TO_TIMESTAMP_NTZ('2021-01-01') AND TO_TIMESTAMP_NTZ('2021-12-31')
)
SELECT
DATE_TRUNC('MONTH', TO_DATE(TO_TIMESTAMP_NTZ(orders_x_users."created_at" / 1000000))) AS "reporting_month",
orders_x_users."users_country",
orders_x_users."product_department",
orders_x_users."product_category",
COUNT(DISTINCT orders_x_users."order_id") AS "n_order",
COUNT(DISTINCT orders_x_users."user_id") AS "n_purchasers",
SUM(orders_x_users."product_retail_price") - SUM(orders_x_users."cost") AS "profit"
FROM orders_x_users
GROUP BY 1, 2, 3, 4
ORDER BY "reporting_month";
WITH
orders AS (
SELECT
"order_id",
"user_id",
"created_at",
DATE_TRUNC('MONTH', TO_TIMESTAMP_NTZ("delivered_at" / 1000000)) AS "delivery_month", -- Converting to timestamp
"status"
FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDERS"
),
order_items AS (
SELECT
"order_id",
"product_id",
"sale_price"
FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."ORDER_ITEMS"
),
products AS (
SELECT
"id",
"cost"
FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."PRODUCTS"
),
users AS (
SELECT
"id",
"traffic_source"
FROM "THELOOK_ECOMMERCE"."THELOOK_ECOMMERCE"."USERS"
),
filter_join AS (
SELECT
orders."order_id",
orders."user_id",
order_items."product_id",
orders."delivery_month",
orders."status",
order_items."sale_price",
products."cost",
users."traffic_source"
FROM orders
JOIN order_items ON orders."order_id" = order_items."order_id"
JOIN products ON order_items."product_id" = products."id"
JOIN users ON orders."user_id" = users."id"
WHERE orders."status" = 'Complete'
AND users."traffic_source" = 'Facebook'
AND TO_TIMESTAMP_NTZ(orders."created_at" / 1000000) BETWEEN TO_TIMESTAMP_NTZ('2022-07-01') AND TO_TIMESTAMP_NTZ('2023-11-30') -- Include July for calculation
),
monthly_sales AS (
SELECT
"delivery_month",
"traffic_source",
SUM("sale_price") AS "total_revenue",
SUM("sale_price") - SUM("cost") AS "total_profit",
COUNT(DISTINCT "product_id") AS "product_quantity",
COUNT(DISTINCT "order_id") AS "orders_quantity",
COUNT(DISTINCT "user_id") AS "users_quantity"
FROM filter_join
GROUP BY "delivery_month", "traffic_source"
)
-- Filter to show only 8th month and onwards, but calculate using July
SELECT
current_month."delivery_month",
COALESCE(
current_month."total_profit" - previous_month."total_profit",
0 -- If there is no previous month (i.e. for 8月), return 0
) AS "profit_vs_prior_month"
FROM monthly_sales AS current_month
LEFT JOIN monthly_sales AS previous_month
ON current_month."traffic_source" = previous_month."traffic_source"
AND current_month."delivery_month" = DATEADD(MONTH, -1, previous_month."delivery_month") -- Correctly join to previous month
WHERE current_month."delivery_month" >= '2022-08-01' -- Only show August and later data, but use July for calculation
ORDER BY "profit_vs_prior_month" DESC
LIMIT 5;
WITH philadelphia AS (
SELECT
*
FROM
GEO_OPENSTREETMAP_CENSUS_PLACES.GEO_US_CENSUS_PLACES.PLACES_PENNSYLVANIA
WHERE
"place_name" = 'Philadelphia'
),
amenities AS (
SELECT
features.*,
tags.value:"value" AS amenity
FROM
GEO_OPENSTREETMAP_CENSUS_PLACES.GEO_OPENSTREETMAP.PLANET_FEATURES_POINTS AS features
CROSS JOIN philadelphia
-- Use FLATTEN on "all_tags" to get the tags and filter by "key"
, LATERAL FLATTEN(input => features."all_tags") AS tags
WHERE
ST_CONTAINS(ST_GEOGFROMWKB(philadelphia."place_geom"), ST_GEOGFROMWKB(features."geometry"))
AND
tags.value:"key" = 'amenity'
AND
tags.value:"value" IN ('library', 'place_of_worship', 'community_centre')
),
joiin AS (
SELECT
a1.*,
a2."osm_id" AS nearest_osm_id,
ST_DISTANCE(ST_GEOGFROMWKB(a1."geometry"), ST_GEOGFROMWKB(a2."geometry")) AS distance,
ROW_NUMBER() OVER (PARTITION BY a1."osm_id" ORDER BY ST_DISTANCE(ST_GEOGFROMWKB(a1."geometry"), ST_GEOGFROMWKB(a2."geometry"))) AS row_num
FROM amenities a1
CROSS JOIN amenities a2
WHERE a1."osm_id" < a2."osm_id"
ORDER BY a1."osm_id", distance
)
SELECT distance
FROM joiin
WHERE row_num = 1
ORDER BY distance ASC
LIMIT 1;
WITH daily_forecasts AS (
SELECT
"TRI"."creation_time",
CAST(DATEADD(hour, 1, TO_TIMESTAMP_NTZ(TO_NUMBER("forecast".value:"time") / 1000000)) AS DATE) AS "local_forecast_date",
MAX(
CASE
WHEN "forecast".value:"temperature_2m_above_ground" IS NOT NULL
THEN "forecast".value:"temperature_2m_above_ground"
ELSE NULL
END
) AS "max_temp",
MIN(
CASE
WHEN "forecast".value:"temperature_2m_above_ground" IS NOT NULL
THEN "forecast".value:"temperature_2m_above_ground"
ELSE NULL
END
) AS "min_temp",
AVG(
CASE
WHEN "forecast".value:"temperature_2m_above_ground" IS NOT NULL
THEN "forecast".value:"temperature_2m_above_ground"
ELSE NULL
END
) AS "avg_temp",
SUM(
CASE
WHEN "forecast".value:"total_precipitation_surface" IS NOT NULL
THEN "forecast".value:"total_precipitation_surface"
ELSE 0
END
) AS "total_precipitation",
AVG(
CASE
WHEN CAST(DATEADD(hour, 1, TO_TIMESTAMP_NTZ(TO_NUMBER("forecast".value:"time") / 1000000) ) AS TIME) BETWEEN '10:00:00' AND '17:00:00'
AND "forecast".value:"total_cloud_cover_entire_atmosphere" IS NOT NULL
THEN "forecast".value:"total_cloud_cover_entire_atmosphere"
ELSE NULL
END
) AS "avg_cloud_cover",
CASE
WHEN AVG("forecast".value:"temperature_2m_above_ground") < 32 THEN
SUM(
CASE
WHEN "forecast".value:"total_precipitation_surface" IS NOT NULL
THEN "forecast".value:"total_precipitation_surface"
ELSE 0
END
)
ELSE 0
END AS "total_snow",
CASE
WHEN AVG("forecast".value:"temperature_2m_above_ground") >= 32 THEN
SUM(
CASE
WHEN "forecast".value:"total_precipitation_surface" IS NOT NULL
THEN "forecast".value:"total_precipitation_surface"
ELSE 0
END
)
ELSE 0
END AS "total_rain"
FROM
"NOAA_GLOBAL_FORECAST_SYSTEM"."NOAA_GLOBAL_FORECAST_SYSTEM"."NOAA_GFS0P25" AS "TRI"
CROSS JOIN LATERAL FLATTEN(input => "TRI"."forecast") AS "forecast"
WHERE
TO_TIMESTAMP_NTZ(TO_NUMBER("TRI"."creation_time") / 1000000) BETWEEN '2019-07-01' AND '2021-07-31'
AND ST_DWITHIN(
ST_GEOGFROMWKB("TRI"."geography"),
ST_POINT(26.75, 51.5),
5000
)
AND CAST(TO_TIMESTAMP_NTZ(TO_NUMBER("forecast".value:"time") / 1000000) AS DATE) = DATEADD(day, 1, CAST( TO_TIMESTAMP_NTZ(TO_NUMBER("TRI"."creation_time") / 1000000) AS DATE))
GROUP BY
"TRI"."creation_time",
"local_forecast_date"
)
SELECT
TO_TIMESTAMP_NTZ(TO_NUMBER("creation_time") / 1000000),
"local_forecast_date" AS "forecast_date",
"max_temp",
"min_temp",
"avg_temp",
"total_precipitation",
"avg_cloud_cover",
"total_snow",
"total_rain"
FROM
daily_forecasts
ORDER BY
"creation_time",
"forecast_date";
SELECT
"trip_id",
"duration_sec",
DATE(TO_TIMESTAMP_LTZ("start_date" / 1000000)) AS "star_date",
"start_station_name",
CONCAT("start_station_name", ' - ', "end_station_name") AS "route",
"bike_number",
"subscriber_type",
"member_birth_year",
(EXTRACT(YEAR FROM CURRENT_DATE()) - "member_birth_year") AS "age",
CASE
WHEN (EXTRACT(YEAR FROM CURRENT_DATE()) - "member_birth_year") < 40 THEN 'Young (<40 Y.O)'
WHEN (EXTRACT(YEAR FROM CURRENT_DATE()) - "member_birth_year") BETWEEN 40 AND 60 THEN 'Adult (40-60 Y.O)'
ELSE 'Senior Adult (>60 Y.O)'
END AS "age_class",
"member_gender",
c."name" AS "region_name"
FROM "SAN_FRANCISCO_PLUS"."SAN_FRANCISCO_BIKESHARE"."BIKESHARE_TRIPS" a
LEFT JOIN "SAN_FRANCISCO_PLUS"."SAN_FRANCISCO_BIKESHARE"."BIKESHARE_STATION_INFO" b
ON a."start_station_id" = b."station_id"
LEFT JOIN "SAN_FRANCISCO_PLUS"."SAN_FRANCISCO_BIKESHARE"."BIKESHARE_REGIONS" c
ON b."region_id" = c."region_id"
WHERE TO_TIMESTAMP_LTZ("start_date" / 1000000) BETWEEN '2017-07-01' AND '2017-12-31'
AND b."station_id" IS NOT NULL
AND "member_birth_year" IS NOT NULL
AND "member_gender" IS NOT NULL
ORDER BY "duration_sec" DESC
LIMIT 5;
WITH watched_repos AS (
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201701
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201702
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201703
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201704
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201705
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201706
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201707
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201708
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201709
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201710
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201711
WHERE
"type" = 'WatchEvent'
UNION ALL
SELECT
PARSE_JSON("repo"):"name"::STRING AS "repo"
FROM
GITHUB_REPOS_DATE.MONTH._201712
WHERE
"type" = 'WatchEvent'
),
repo_watch_counts AS (
SELECT
"repo",
COUNT(*) AS "watch_count"
FROM
watched_repos
GROUP BY
"repo"
)
SELECT
REPLACE(r."repo", '"', '') AS "repo",
r."watch_count"
FROM
GITHUB_REPOS_DATE.GITHUB_REPOS.SAMPLE_FILES AS f
JOIN
GITHUB_REPOS_DATE.GITHUB_REPOS.SAMPLE_CONTENTS AS c
ON f."id" = c."id"
JOIN
repo_watch_counts AS r
ON f."repo_name" = r."repo"
WHERE
f."path" LIKE '%.py'
AND c."size" < 15000
AND POSITION('def ' IN c."content") > 0
GROUP BY
r."repo", r."watch_count"
ORDER BY
r."watch_count" DESC
LIMIT
3;
SELECT
COUNT(*) AS "total_count"
FROM
IDC.IDC_V17.DICOM_PIVOT AS "dicom_pivot"
WHERE
"StudyInstanceUID" IN (
SELECT
"StudyInstanceUID"
FROM
IDC.IDC_V17.DICOM_PIVOT AS "dicom_pivot"
WHERE
"StudyInstanceUID" IN (
SELECT
"StudyInstanceUID"
FROM
IDC.IDC_V17.DICOM_PIVOT AS "dicom_pivot"
WHERE
LOWER("dicom_pivot"."SegmentedPropertyTypeCodeSequence") LIKE LOWER('15825003')
GROUP BY
"StudyInstanceUID"
INTERSECT
SELECT
"StudyInstanceUID"
FROM
IDC.IDC_V17.DICOM_PIVOT AS "dicom_pivot"
WHERE
"dicom_pivot"."collection_id" IN ('Community', 'nsclc_radiomics')
GROUP BY
"StudyInstanceUID"
)
GROUP BY
"StudyInstanceUID"
);
WITH relevant_series AS (
SELECT
DISTINCT "StudyInstanceUID"
FROM
IDC.IDC_V17.DICOM_ALL
WHERE
"collection_id" = 'qin_prostate_repeatability'
AND "SeriesDescription" IN (
'DWI',
'T2 Weighted Axial',
'Apparent Diffusion Coefficient',
'T2 Weighted Axial Segmentations',
'Apparent Diffusion Coefficient Segmentations'
)
),
t2_seg_lesion_series AS (
SELECT
DISTINCT "StudyInstanceUID"
FROM
IDC.IDC_V17.DICOM_ALL
CROSS JOIN LATERAL FLATTEN(input => "SegmentSequence") AS segSeq
WHERE
"collection_id" = 'qin_prostate_repeatability'
AND "SeriesDescription" = 'T2 Weighted Axial Segmentations'
)
SELECT
COUNT(DISTINCT "StudyInstanceUID") AS "total_count"
FROM (
SELECT
"StudyInstanceUID"
FROM relevant_series
UNION ALL
SELECT
"StudyInstanceUID"
FROM t2_seg_lesion_series
);
WITH all_transactions AS (
SELECT
TO_TIMESTAMP_NTZ("block_timestamp" / 1000000) AS "timestamp", -- 将时间戳转换为日期时间格式
"value",
'input' AS "type"
FROM
"CRYPTO"."CRYPTO_BITCOIN"."INPUTS"
UNION ALL
SELECT
TO_TIMESTAMP_NTZ("block_timestamp" / 1000000) AS "timestamp", -- 将时间戳转换为日期时间格式
"value",
'output' AS "type"
FROM
"CRYPTO"."CRYPTO_BITCOIN"."OUTPUTS"
),
filtered_transactions AS (
SELECT
EXTRACT(YEAR FROM "timestamp") AS "year",
"value"
FROM
all_transactions
WHERE "type" = 'output'
),
average_output_values AS (
SELECT
"year",
AVG("value") AS "avg_value"
FROM
filtered_transactions
GROUP BY "year"
),
average_transaction_values AS (
SELECT
EXTRACT(YEAR FROM TO_TIMESTAMP_NTZ("block_timestamp" / 1000000)) AS "year", -- 同样转换时间戳
AVG("output_value") AS "avg_transaction_value"
FROM
"CRYPTO"."CRYPTO_BITCOIN"."TRANSACTIONS"
GROUP BY "year"
ORDER BY "year"
),
common_years AS (
SELECT
ao."year",
ao."avg_value" AS "avg_output_value",
atv."avg_transaction_value"
FROM
average_output_values ao
JOIN
average_transaction_values atv
ON ao."year" = atv."year"
)
SELECT
"year",
"avg_transaction_value" - "avg_output_value" AS "difference"
FROM
common_years
ORDER BY
"year";
WITH transaction_addresses AS (
SELECT
"from_address",
"to_address",
CAST("value" AS NUMERIC) / 1000000 AS "value"
FROM
"CRYPTO"."CRYPTO_ETHEREUM"."TOKEN_TRANSFERS"
WHERE
"token_address" = '0xa92a861fc11b99b24296af880011b47f9cafb5ab'
),
out_addresses AS (
SELECT
"from_address",
SUM(-1 * "value") AS "total_value"
FROM
transaction_addresses
GROUP BY
"from_address"
),
in_addresses AS (
SELECT
"to_address",
SUM("value") AS "total_value"
FROM
transaction_addresses
GROUP BY
"to_address"
),
all_addresses AS (
SELECT
"from_address" AS "address",
"total_value"
FROM
out_addresses
UNION ALL
SELECT
"to_address" AS "address",
"total_value"
FROM
in_addresses
)
SELECT
"address"
FROM
all_addresses
GROUP BY
"address"
HAVING
SUM("total_value") > 0
ORDER BY
SUM("total_value") ASC
LIMIT 3;
WITH seg_rtstruct AS (
SELECT
"collection_id",
"StudyInstanceUID",
"SeriesInstanceUID",
CONCAT('https://viewer.imaging.datacommons.cancer.gov/viewer/', "StudyInstanceUID") AS "viewer_url",
"instance_size"
FROM
"IDC"."IDC_V17"."DICOM_ALL"
WHERE
"Modality" IN ('SEG', 'RTSTRUCT')
AND "SOPClassUID" = '1.2.840.10008.5.1.4.1.1.66.4'
AND ARRAY_SIZE("ReferencedSeriesSequence") = 0
AND ARRAY_SIZE("ReferencedImageSequence") = 0
AND ARRAY_SIZE("SourceImageSequence") = 0
)
SELECT
seg_rtstruct."collection_id",
seg_rtstruct."SeriesInstanceUID",
seg_rtstruct."StudyInstanceUID",
seg_rtstruct."viewer_url",
SUM(seg_rtstruct."instance_size") / 1024 AS "collection_size_KB"
FROM
seg_rtstruct
GROUP BY
seg_rtstruct."collection_id",
seg_rtstruct."SeriesInstanceUID",
seg_rtstruct."StudyInstanceUID",
seg_rtstruct."viewer_url"
ORDER BY
"collection_size_KB" DESC;
WITH
sampled_sops AS (
SELECT
"collection_id",
"SeriesDescription",
"SeriesInstanceUID",
"SOPInstanceUID" AS "seg_SOPInstanceUID",
COALESCE(
"ReferencedSeriesSequence"[0]."ReferencedInstanceSequence"[0]."ReferencedSOPInstanceUID",
"ReferencedImageSequence"[0]."ReferencedSOPInstanceUID",
"SourceImageSequence"[0]."ReferencedSOPInstanceUID"
) AS "referenced_sop"
FROM
"IDC"."IDC_V17"."DICOM_ALL"
WHERE
"Modality" = 'SEG'
AND "SOPClassUID" = '1.2.840.10008.5.1.4.1.1.66.4'
AND "access" = 'Public'
),
segmentations_data AS (
SELECT
dicom_all."collection_id",
dicom_all."PatientID",
dicom_all."SOPInstanceUID",
REPLACE(segmentations."SegmentedPropertyCategory":CodeMeaning::STRING, '"', '') AS "segmentation_category",
REPLACE(segmentations."SegmentedPropertyType":CodeMeaning::STRING, '"', '') AS "segmentation_type"
FROM
sampled_sops
JOIN
"IDC"."IDC_V17"."DICOM_ALL" AS dicom_all
ON
sampled_sops."referenced_sop" = dicom_all."SOPInstanceUID"
JOIN
"IDC"."IDC_V17"."SEGMENTATIONS" AS segmentations
ON
segmentations."SOPInstanceUID" = sampled_sops."seg_SOPInstanceUID"
)
SELECT
"segmentation_category",
COUNT(*) AS "count_"
FROM
segmentations_data
GROUP BY
"segmentation_category"
ORDER BY
"count_" DESC
LIMIT 5;
WITH union_mr_seg AS (
SELECT
"dicom_all_mr"."SOPInstanceUID",
'' AS "segPropertyTypeCodeMeaning",
'' AS "segPropertyCategoryCodeMeaning"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "dicom_all_mr"
WHERE
"dicom_all_mr"."SeriesInstanceUID" IN ('1.3.6.1.4.1.14519.5.2.1.3671.4754.105976129314091491952445656147')
UNION ALL
SELECT
"dicom_all_seg"."SOPInstanceUID",
"segmentations"."SegmentedPropertyType":"CodeMeaning" AS "segPropertyTypeCodeMeaning",
"segmentations"."SegmentedPropertyCategory":"CodeMeaning" AS "segPropertyCategoryCodeMeaning"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "dicom_all_seg"
JOIN
"IDC"."IDC_V17"."SEGMENTATIONS" AS "segmentations"
ON
"dicom_all_seg"."SOPInstanceUID" = "segmentations"."SOPInstanceUID"
)
SELECT
"dc_all"."Modality",
COUNT(*) AS "count_"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "dc_all"
INNER JOIN
union_mr_seg
ON
"dc_all"."SOPInstanceUID" = union_mr_seg."SOPInstanceUID"
GROUP BY
"dc_all"."Modality"
ORDER BY
"count_" DESC
LIMIT 1;
WITH bounding_area AS (
SELECT
"osm_id",
"geometry" AS geometry,
ST_AREA(ST_GEOGRAPHYFROMWKB("geometry")) AS area
FROM GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_FEATURES,
LATERAL FLATTEN(INPUT => PLANET_FEATURES."all_tags") AS "tag"
WHERE
"feature_type" = 'multipolygons'
AND "tag".value:"key" = 'boundary'
AND "tag".value:"value" = 'administrative'
),
poi AS (
SELECT
nodes."id" AS poi_id,
nodes."geometry" AS poi_geometry,
tags.value:"value" AS poitype
FROM GEO_OPENSTREETMAP.GEO_OPENSTREETMAP.PLANET_NODES AS nodes,
LATERAL FLATTEN(INPUT => nodes."all_tags") AS tags
WHERE tags.value:"key" = 'amenity'
),
poi_counts AS (
SELECT
ba."osm_id",
COUNT(poi.poi_id) AS total_pois
FROM bounding_area ba
JOIN poi
ON ST_DWITHIN(
ST_GEOGRAPHYFROMWKB(ba.geometry),
ST_GEOGRAPHYFROMWKB(poi.poi_geometry),
0.0
)
GROUP BY ba."osm_id"
),
median_value AS (
SELECT
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_pois) AS median_pois
FROM poi_counts
),
closest_to_median AS (
SELECT
"osm_id",
total_pois,
ABS(total_pois - (SELECT median_pois FROM median_value)) AS diff_from_median
FROM poi_counts
)
SELECT
"osm_id"
FROM closest_to_median
ORDER BY diff_from_median
LIMIT 1;
SELECT
"ZIPSTART"."zip_code" AS zip_code_start,
"ZIPEND"."zip_code" AS zip_code_end
FROM
"NEW_YORK_CITIBIKE_1"."NEW_YORK_CITIBIKE"."CITIBIKE_TRIPS" AS "TRI"
INNER JOIN
"NEW_YORK_CITIBIKE_1"."GEO_US_BOUNDARIES"."ZIP_CODES" AS "ZIPSTART"
ON ST_WITHIN(
ST_POINT("TRI"."start_station_longitude", "TRI"."start_station_latitude"),
ST_GEOGFROMWKB("ZIPSTART"."zip_code_geom")
)
INNER JOIN
"NEW_YORK_CITIBIKE_1"."GEO_US_BOUNDARIES"."ZIP_CODES" AS "ZIPEND"
ON ST_WITHIN(
ST_POINT("TRI"."end_station_longitude", "TRI"."end_station_latitude"),
ST_GEOGFROMWKB("ZIPEND"."zip_code_geom")
)
INNER JOIN
"NEW_YORK_CITIBIKE_1"."NOAA_GSOD"."GSOD2015" AS "WEA"
ON TO_DATE(TO_CHAR("WEA"."year") || LPAD(TO_CHAR("WEA"."mo"), 2, '0') || LPAD(TO_CHAR("WEA"."da"), 2, '0'), 'YYYYMMDD') = DATE_TRUNC('DAY', TO_TIMESTAMP_NTZ(TO_NUMBER("TRI"."starttime") / 1000000))
WHERE
"WEA"."wban" = '94728'
AND DATE_TRUNC('DAY', TO_TIMESTAMP_NTZ(TO_NUMBER("TRI"."starttime") / 1000000)) = DATE '2015-07-15'
ORDER BY
"WEA"."temp" DESC, "ZIPSTART"."zip_code" ASC, "ZIPEND"."zip_code" DESC
LIMIT 1;
WITH repositories AS (
SELECT
t2."repo_name",
t2."language"
FROM (
SELECT
t1."repo_name",
t1."language",
RANK() OVER (PARTITION BY t1."repo_name" ORDER BY t1."language_bytes" DESC) AS "rank"
FROM (
SELECT
l."repo_name",
lang.value:"name"::STRING AS "language",
lang.value:"bytes"::NUMBER AS "language_bytes"
FROM
GITHUB_REPOS.GITHUB_REPOS.LANGUAGES AS l,
LATERAL FLATTEN(input => l."language") AS lang
) AS t1
) AS t2
WHERE t2."rank" = 1
),
python_repo AS (
SELECT
"repo_name",
"language"
FROM
repositories
WHERE
"language" = 'JavaScript'
)
SELECT
sc."repo_name",
COUNT(sc."commit") AS "num_commits"
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_COMMITS AS sc
INNER JOIN
python_repo
ON
python_repo."repo_name" = sc."repo_name"
GROUP BY
sc."repo_name"
ORDER BY
"num_commits" DESC
LIMIT 2;
WITH json_files AS (
SELECT
c."id",
TRY_PARSE_JSON(c."content"):"require" AS "dependencies"
FROM
GITHUB_REPOS.GITHUB_REPOS.SAMPLE_CONTENTS c
),
package_names AS (
SELECT
f.key AS "package_name"
FROM
json_files,
LATERAL FLATTEN(input => "dependencies") AS f
)
SELECT
"package_name",
COUNT(*) AS "count"
FROM
package_names
WHERE
"package_name" IS NOT NULL
GROUP BY
"package_name"
ORDER BY
"count" DESC;
WITH
-- Studies that have MR volumes
"mr_studies" AS (
SELECT
"dicom_all_mr"."StudyInstanceUID"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "dicom_all_mr"
WHERE
"Modality" = 'MR'
AND "collection_id" = 'qin_prostate_repeatability'
AND CONTAINS("SeriesDescription", 'T2 Weighted Axial')
),
"seg_studies" AS (
SELECT
"dicom_all_seg"."StudyInstanceUID"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "dicom_all_seg"
JOIN
"IDC"."IDC_V17"."SEGMENTATIONS" AS "segmentations"
ON
"dicom_all_seg"."SOPInstanceUID" = "segmentations"."SOPInstanceUID"
WHERE
"collection_id" = 'qin_prostate_repeatability'
AND CONTAINS("segmentations"."SegmentedPropertyType":"CodeMeaning", 'Peripheral zone')
AND "segmentations"."SegmentedPropertyCategory":"CodeMeaning" = 'Anatomical Structure'
)
SELECT DISTINCT
"mr_studies"."StudyInstanceUID"
FROM
"mr_studies"
JOIN
"seg_studies"
ON
"mr_studies"."StudyInstanceUID" = "seg_studies"."StudyInstanceUID";
SELECT
"creative_page_url",
TO_TIMESTAMP(GET("region_stat".value, 'first_shown')) AS "first_shown",
TO_TIMESTAMP(GET("region_stat".value, 'last_shown')) AS "last_shown",
REPLACE(REPLACE("disapproval"[0]."removal_reason", '""', '"'), '"', '') AS "removal_reason",
REPLACE(REPLACE("disapproval"[0]."violation_category", '""', '"'), '"', '') AS "violation_category",
GET("region_stat".value, 'times_shown_lower_bound') AS "times_shown_lower",
GET("region_stat".value, 'times_shown_upper_bound') AS "times_shown_upper"
FROM
"GOOGLE_ADS"."GOOGLE_ADS_TRANSPARENCY_CENTER"."REMOVED_CREATIVE_STATS",
LATERAL FLATTEN(input => "region_stats") AS "region_stat"
WHERE
GET("region_stat".value, 'region_code') = 'HR'
AND GET("region_stat".value, 'times_shown_availability_date') IS NULL
AND GET("region_stat".value, 'times_shown_lower_bound') > 10000
AND GET("region_stat".value, 'times_shown_upper_bound') < 25000
AND (
GET("audience_selection_approach_info", 'demographic_info') != 'CRITERIA_UNUSED'
OR GET("audience_selection_approach_info", 'geo_location') != 'CRITERIA_UNUSED'
OR GET("audience_selection_approach_info", 'contextual_signals') != 'CRITERIA_UNUSED'
OR GET("audience_selection_approach_info", 'customer_lists') != 'CRITERIA_UNUSED'
OR GET("audience_selection_approach_info", 'topics_of_interest') != 'CRITERIA_UNUSED'
)
ORDER BY
"last_shown" DESC
LIMIT 5;
WITH
SpecimenPreparationSequence_unnested AS (
SELECT
d."SOPInstanceUID",
concept_name_code_sequence.value:"CodeMeaning"::STRING AS "cnc_cm",
concept_name_code_sequence.value:"CodingSchemeDesignator"::STRING AS "cnc_csd",
concept_name_code_sequence.value:"CodeValue"::STRING AS "cnc_val",
concept_code_sequence.value:"CodeMeaning"::STRING AS "ccs_cm",
concept_code_sequence.value:"CodingSchemeDesignator"::STRING AS "ccs_csd",
concept_code_sequence.value:"CodeValue"::STRING AS "ccs_val"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS d,
LATERAL FLATTEN(input => d."SpecimenDescriptionSequence") AS spec_desc,
LATERAL FLATTEN(input => spec_desc.value:"SpecimenPreparationSequence") AS prep_seq,
LATERAL FLATTEN(input => prep_seq.value:"SpecimenPreparationStepContentItemSequence") AS prep_step,
LATERAL FLATTEN(input => prep_step.value:"ConceptNameCodeSequence") AS concept_name_code_sequence,
LATERAL FLATTEN(input => prep_step.value:"ConceptCodeSequence") AS concept_code_sequence
),
slide_embedding AS (
SELECT
"SOPInstanceUID",
ARRAY_AGG(DISTINCT(CONCAT("ccs_cm", ':', "ccs_csd", ':', "ccs_val"))) AS "embeddingMedium_code_str"
FROM
SpecimenPreparationSequence_unnested
WHERE
"cnc_csd" = 'SCT' AND "cnc_val" = '430863003' -- CodeMeaning is 'Embedding medium'
GROUP BY
"SOPInstanceUID"
),
slide_staining AS (
SELECT
"SOPInstanceUID",
ARRAY_AGG(DISTINCT(CONCAT("ccs_cm", ':', "ccs_csd", ':', "ccs_val"))) AS "staining_usingSubstance_code_str"
FROM
SpecimenPreparationSequence_unnested
WHERE
"cnc_csd" = 'SCT' AND "cnc_val" = '424361007' -- CodeMeaning is 'Using substance'
GROUP BY
"SOPInstanceUID"
),
embedding_data AS (
SELECT
d."SOPInstanceUID",
d."instance_size",
e."embeddingMedium_code_str",
s."staining_usingSubstance_code_str"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS d
LEFT JOIN
slide_embedding AS e ON d."SOPInstanceUID" = e."SOPInstanceUID"
LEFT JOIN
slide_staining AS s ON d."SOPInstanceUID" = s."SOPInstanceUID"
WHERE
d."Modality" = 'SM'
)
SELECT
SPLIT_PART(embeddingMedium_CodeMeaning_flat.VALUE::STRING, ':', 1) AS "embeddingMedium_CodeMeaning",
SPLIT_PART(staining_usingSubstance_CodeMeaning_flat.VALUE::STRING, ':', 1) AS "staining_usingSubstance_CodeMeaning",
COUNT(*) AS "count_"
FROM
embedding_data
, LATERAL FLATTEN(input => embedding_data."embeddingMedium_code_str") AS embeddingMedium_CodeMeaning_flat
, LATERAL FLATTEN(input => embedding_data."staining_usingSubstance_code_str") AS staining_usingSubstance_CodeMeaning_flat
GROUP BY
SPLIT_PART(embeddingMedium_CodeMeaning_flat.VALUE::STRING, ':', 1),
SPLIT_PART(staining_usingSubstance_CodeMeaning_flat.VALUE::STRING, ':', 1);
WITH
nonLocalizerRawData AS (
SELECT
"SeriesInstanceUID",
"StudyInstanceUID",
"PatientID",
TRY_CAST("Exposure"::STRING AS FLOAT) AS "Exposure", -- 直接从 bid 获取 Exposure
TRY_CAST(axes.VALUE::STRING AS FLOAT) AS "zImagePosition",
LEAD(TRY_CAST(axes.VALUE::STRING AS FLOAT)) OVER (
PARTITION BY "SeriesInstanceUID"
ORDER BY TRY_CAST(axes.VALUE::STRING AS FLOAT)
) - TRY_CAST(axes.VALUE::STRING AS FLOAT) AS "slice_interval",
"instance_size" AS "instanceSize"
FROM
"IDC"."IDC_V17"."DICOM_ALL" AS "bid",
LATERAL FLATTEN(input => "bid"."ImagePositionPatient") AS axes -- 使用 LATERAL FLATTEN 展开数组
WHERE
"collection_id" = 'nlst'
AND "Modality" = 'CT'
),
geometryChecks AS (
SELECT
"SeriesInstanceUID",
"StudyInstanceUID",
"PatientID",
ARRAY_AGG(DISTINCT "slice_interval") AS "sliceIntervalDifferences",
ARRAY_AGG(DISTINCT "Exposure") AS "distinctExposures",
SUM("instanceSize") / 1024 / 1024 AS "seriesSizeInMB"
FROM
nonLocalizerRawData
GROUP BY
"SeriesInstanceUID",
"StudyInstanceUID",
"PatientID"
),
patientMetrics AS (
SELECT
"PatientID",
MAX(TRY_CAST(sid.VALUE::STRING AS FLOAT)) AS "maxSliceIntervalDifference",
MIN(TRY_CAST(sid.VALUE::STRING AS FLOAT)) AS "minSliceIntervalDifference",
MAX(TRY_CAST(sid.VALUE::STRING AS FLOAT)) - MIN(TRY_CAST(sid.VALUE::STRING AS FLOAT)) AS "sliceIntervalDifferenceTolerance",
MAX(TRY_CAST(de.VALUE::STRING AS FLOAT)) AS "maxExposure",
MIN(TRY_CAST(de.VALUE::STRING AS FLOAT)) AS "minExposure",
MAX(TRY_CAST(de.VALUE::STRING AS FLOAT)) - MIN(TRY_CAST(de.VALUE::STRING AS FLOAT)) AS "maxExposureDifference",
"seriesSizeInMB"
FROM
geometryChecks,
LATERAL FLATTEN(input => "sliceIntervalDifferences") AS sid, -- 展开 sliceIntervalDifferences
LATERAL FLATTEN(input => "distinctExposures") AS de -- 展开 distinctExposures
WHERE
sid.VALUE IS NOT NULL
AND de.VALUE IS NOT NULL
GROUP BY
"PatientID",
"seriesSizeInMB"
),
top3BySliceInterval AS (
SELECT
"PatientID",
"seriesSizeInMB"
FROM
patientMetrics
ORDER BY
"sliceIntervalDifferenceTolerance" DESC
LIMIT 3
),
top3ByMaxExposure AS (
SELECT
"PatientID",
"seriesSizeInMB"
FROM
patientMetrics
ORDER BY
"maxExposureDifference" DESC
LIMIT 3
)
SELECT
'Top 3 by Slice Interval' AS "MetricGroup",
AVG("seriesSizeInMB") AS "AverageSeriesSizeInMB"
FROM
top3BySliceInterval
UNION ALL
SELECT
'Top 3 by Max Exposure' AS "MetricGroup",
AVG("seriesSizeInMB") AS "AverageSeriesSizeInMB"
FROM
top3ByMaxExposure;
WITH median_income_diff_by_zipcode AS (
WITH acs_2018 AS (
SELECT
"geo_id",
"median_income" AS "median_income_2018"
FROM
CENSUS_BUREAU_ACS_2.CENSUS_BUREAU_ACS."ZIP_CODES_2018_5YR"
),
acs_2015 AS (
SELECT
"geo_id",
"median_income" AS "median_income_2015"
FROM
CENSUS_BUREAU_ACS_2.CENSUS_BUREAU_ACS."ZIP_CODES_2015_5YR"
),
acs_diff AS (
SELECT
a18."geo_id",
(a18."median_income_2018" - a15."median_income_2015") AS "median_income_diff"
FROM
acs_2018 a18
JOIN
acs_2015 a15 ON a18."geo_id" = a15."geo_id"
)
SELECT
"geo_id",
AVG("median_income_diff") AS "avg_median_income_diff"
FROM
acs_diff
WHERE
"median_income_diff" IS NOT NULL
GROUP BY "geo_id"
),
base_census AS (
SELECT
geo."state_name",
AVG(i."avg_median_income_diff") AS "avg_median_income_diff",
AVG(
"employed_wholesale_trade" * 0.38423645320197042 +
"occupation_natural_resources_construction_maintenance" * 0.48071410777129553 +
"employed_arts_entertainment_recreation_accommodation_food" * 0.89455676291236841 +
"employed_information" * 0.31315240083507306 +
"employed_retail_trade" * 0.51
) AS "avg_vulnerable"
FROM
CENSUS_BUREAU_ACS_2.CENSUS_BUREAU_ACS."ZIP_CODES_2017_5YR" AS census
JOIN
median_income_diff_by_zipcode i ON CAST(census."geo_id" AS STRING) = i."geo_id"
JOIN
CENSUS_BUREAU_ACS_2.GEO_US_BOUNDARIES."ZIP_CODES" geo ON census."geo_id" = geo."zip_code"
GROUP BY geo."state_name"
)
SELECT
"state_name",
"avg_median_income_diff",
"avg_vulnerable"
FROM
base_census
ORDER BY
"avg_median_income_diff" DESC
LIMIT 5;
WITH parsed_burn_logs AS (
SELECT
logs."block_timestamp" AS block_timestamp,
logs."block_number" AS block_number,
logs."transaction_hash" AS transaction_hash,
logs."log_index" AS log_index,
PARSE_JSON(logs."data") AS data,
logs."topics"
FROM CRYPTO.CRYPTO_ETHEREUM.LOGS AS logs
WHERE logs."address" = '0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8'
AND logs."topics"[0] = '0x0c396cd989a39f4459b5fa1aed6a9a8dcdbc45908acfd67e028cd568da98982c'
),
parsed_mint_logs AS (
SELECT
logs."block_timestamp" AS block_timestamp,
logs."block_number" AS block_number,
logs."transaction_hash" AS transaction_hash,
logs."log_index" AS log_index,
PARSE_JSON(logs."data") AS data,
logs."topics"
FROM CRYPTO.CRYPTO_ETHEREUM.LOGS AS logs
WHERE logs."address" = '0x8ad599c3a0ff1de082011efddc58f1908eb6e6d8'
AND logs."topics"[0] = '0x7a53080ba414158be7ec69b987b5fb7d07dee101fe85488f0853ae16239d0bde'
)
SELECT
block_timestamp,
block_number,
transaction_hash
FROM parsed_mint_logs
UNION ALL
SELECT
block_timestamp,
block_number,
transaction_hash
FROM parsed_burn_logs
ORDER BY block_timestamp
LIMIT 5;
WITH
-- Create a common table expression (CTE) named localizerAndJpegCompressedSeries
localizerAndJpegCompressedSeries AS (
SELECT
"SeriesInstanceUID"
FROM
IDC.IDC_V17."DICOM_ALL" AS bid
WHERE
"ImageType" = 'LOCALIZER' OR
"TransferSyntaxUID" IN ('1.2.840.10008.1.2.4.70', '1.2.840.10008.1.2.4.51')
),
-- Create a common table expression (CTE) for x_vector calculation (first three elements)
imageOrientation AS (
SELECT
"SeriesInstanceUID",
ARRAY_AGG(CAST(part.value AS FLOAT)) AS "x_vector"
FROM
IDC.IDC_V17."DICOM_ALL" AS bid,
LATERAL FLATTEN(input => bid."ImageOrientationPatient") AS part
WHERE
part.index BETWEEN 0 AND 2
GROUP BY "SeriesInstanceUID"
),
-- Create a common table expression (CTE) for y_vector calculation (next three elements)
imageOrientationY AS (
SELECT
"SeriesInstanceUID",
ARRAY_AGG(CAST(part.value AS FLOAT)) AS "y_vector"
FROM
IDC.IDC_V17."DICOM_ALL" AS bid,
LATERAL FLATTEN(input => bid."ImageOrientationPatient") AS part
WHERE
part.index BETWEEN 3 AND 5
GROUP BY "SeriesInstanceUID"
),
-- Create a common table expression (CTE) named nonLocalizerRawData
nonLocalizerRawData AS (
SELECT
bid."SeriesInstanceUID", -- Added table alias bid
bid."StudyInstanceUID",
bid."PatientID",
bid."SOPInstanceUID",
bid."SliceThickness",
bid."ImageType",
bid."TransferSyntaxUID",
bid."SeriesNumber",
bid."aws_bucket",
bid."crdc_series_uuid",
CAST(bid."Exposure" AS FLOAT) AS "Exposure", -- Use CAST directly
CAST(ipp.value AS FLOAT) AS "zImagePosition", -- Use CAST directly
CONCAT(ipp2.value, '/', ipp3.value) AS "xyImagePosition",
LEAD(CAST(ipp.value AS FLOAT)) OVER (PARTITION BY bid."SeriesInstanceUID" ORDER BY CAST(ipp.value AS FLOAT)) - CAST(ipp.value AS FLOAT) AS "slice_interval",
ARRAY_TO_STRING(bid."ImageOrientationPatient", '/') AS "iop",
bid."PixelSpacing",
bid."Rows" AS "pixelRows",
bid."Columns" AS "pixelColumns",
bid."instance_size" AS "instanceSize"
FROM
IDC.IDC_V17."DICOM_ALL" AS bid
LEFT JOIN LATERAL FLATTEN(input => bid."ImagePositionPatient") AS ipp
LEFT JOIN LATERAL FLATTEN(input => bid."ImagePositionPatient") AS ipp2
LEFT JOIN LATERAL FLATTEN(input => bid."ImagePositionPatient") AS ipp3
WHERE
bid."collection_id" != 'nlst'
AND bid."Modality" = 'CT'
AND ipp.index = 2
AND ipp2.index = 0
AND ipp3.index = 1
AND bid."SeriesInstanceUID" NOT IN (SELECT "SeriesInstanceUID" FROM localizerAndJpegCompressedSeries)
),
-- Cross product calculation
crossProduct AS (
SELECT
nld."SOPInstanceUID", -- Added table alias nld
nld."SeriesInstanceUID", -- Added table alias nld
OBJECT_CONSTRUCT(
'x', ("x_vector"[1] * "y_vector"[2] - "x_vector"[2] * "y_vector"[1]),
'y', ("x_vector"[2] * "y_vector"[0] - "x_vector"[0] * "y_vector"[2]),
'z', ("x_vector"[0] * "y_vector"[1] - "x_vector"[1] * "y_vector"[0])
) AS "xyCrossProduct"
FROM
nonLocalizerRawData AS nld -- Added alias for nonLocalizerRawData
JOIN imageOrientation AS io ON nld."SeriesInstanceUID" = io."SeriesInstanceUID"
JOIN imageOrientationY AS ioy ON nld."SeriesInstanceUID" = ioy."SeriesInstanceUID"
),
-- Cross product elements extraction and row numbering
crossProductElements AS (
SELECT
cp."SOPInstanceUID",
cp."SeriesInstanceUID",
elem.value,
ROW_NUMBER() OVER (PARTITION BY cp."SOPInstanceUID", cp."SeriesInstanceUID" ORDER BY elem.value) AS rn
FROM
crossProduct AS cp
-- Use LATERAL FLATTEN to explode the cross product object into individual 'x', 'y', and 'z'
JOIN LATERAL FLATTEN(input => ARRAY_CONSTRUCT(
cp."xyCrossProduct"['x'],
cp."xyCrossProduct"['y'],
cp."xyCrossProduct"['z']
)) AS elem -- Simplified 'elem.value' reference here
),
-- Dot product calculation
dotProduct AS (
SELECT
cpe."SOPInstanceUID",
cpe."SeriesInstanceUID",
SUM(
CASE
WHEN cpe.rn = 1 THEN cpe.value * 0 -- x * 0
WHEN cpe.rn = 2 THEN cpe.value * 0 -- y * 0
WHEN cpe.rn = 3 THEN cpe.value * 1 -- z * 1
END
) AS "xyDotProduct"
FROM
crossProductElements AS cpe
GROUP BY
cpe."SOPInstanceUID",
cpe."SeriesInstanceUID"
),
-- Geometry checks for series consistency
geometryChecks AS (
SELECT
gc."SeriesInstanceUID", -- Added table alias gc
gc."SeriesNumber",
gc."aws_bucket",
gc."crdc_series_uuid",
gc."StudyInstanceUID",
gc."PatientID",
ARRAY_AGG(DISTINCT gc."slice_interval") AS "sliceIntervalDifferences",
ARRAY_AGG(DISTINCT gc."Exposure") AS "distinctExposures",
COUNT(DISTINCT gc."iop") AS "iopCount",
COUNT(DISTINCT gc."PixelSpacing") AS "pixelSpacingCount",
COUNT(DISTINCT gc."zImagePosition") AS "positionCount",
COUNT(DISTINCT gc."xyImagePosition") AS "xyPositionCount",
COUNT(DISTINCT gc."SOPInstanceUID") AS "sopInstanceCount",
COUNT(DISTINCT gc."SliceThickness") AS "sliceThicknessCount",
COUNT(DISTINCT gc."Exposure") AS "exposureCount",
COUNT(DISTINCT gc."pixelRows") AS "pixelRowCount",
COUNT(DISTINCT gc."pixelColumns") AS "pixelColumnCount",
dp."xyDotProduct", -- Added xyDotProduct from dotProduct
SUM(gc."instanceSize") / 1024 / 1024 AS "seriesSizeInMiB"
FROM
nonLocalizerRawData AS gc -- Added table alias gc
JOIN dotProduct AS dp ON gc."SeriesInstanceUID" = dp."SeriesInstanceUID"
AND gc."SOPInstanceUID" = dp."SOPInstanceUID"
GROUP BY
gc."SeriesInstanceUID",
gc."SeriesNumber",
gc."aws_bucket",
gc."crdc_series_uuid",
gc."StudyInstanceUID",
gc."PatientID",
dp."xyDotProduct" -- Include xyDotProduct in GROUP BY
HAVING
COUNT(DISTINCT gc."iop") = 1
AND COUNT(DISTINCT gc."PixelSpacing") = 1
AND COUNT(DISTINCT gc."SOPInstanceUID") = COUNT(DISTINCT gc."zImagePosition")
AND COUNT(DISTINCT gc."xyImagePosition") = 1
AND COUNT(DISTINCT gc."pixelRows") = 1
AND COUNT(DISTINCT gc."pixelColumns") = 1
AND ABS(dp."xyDotProduct") BETWEEN 0.99 AND 1.01
)
SELECT
geometryChecks."SeriesInstanceUID", -- Added table alias
geometryChecks."SeriesNumber", -- Added table alias
geometryChecks."PatientID", -- Added table alias
geometryChecks."seriesSizeInMiB"
FROM
geometryChecks
ORDER BY
geometryChecks."seriesSizeInMiB" DESC
LIMIT 5;