# Import libraries (all that will be needed for this project)

import earthpy # local data management (NOTE(review): not referenced in the visible code - confirm it is still needed)
import pandas as pd
import numpy as np
import hvplot.pandas # imported for its side effect: enables the DataFrame .hvplot() call used for the interactive plot
import holoviews as hv # used to save the interactive plot to an HTML file
import matplotlib.pyplot as plt # advanced plotting options
import seaborn as sns # statistical plots for tabular data
# Estimator used to fit an OLS linear regression
from sklearn.linear_model import LinearRegression

# Signal that every import above succeeded
print('Done!')

# Access Observed Temperature (TOBS) data from the Bozeman weather station.
# The request is built one query parameter per line: the daily-summaries
# dataset, the TOBS data type, station USC00241044, the date range
# 1892-04-08 through 2025-09-25, and standard units.

bzn_tobs_url = (
    'https://www.ncei.noaa.gov/access/services/data/v1'
    '?dataset=daily-summaries'
    '&dataTypes=TOBS'
    '&stations=USC00241044'
    '&startDate=1892-04-08'
    '&endDate=2025-09-25'
    '&units=standard'
)

# Echo the assembled URL so it can be checked by eye
bzn_tobs_url

# Download the climate data
# The raw TOBS records are stored in bzn_t_raw_df. The DATE column becomes
# the index and is parsed to datetimes; the string 'NaN' is read as missing.
bzn_t_raw_df = pd.read_csv(
    filepath_or_buffer=bzn_tobs_url,
    na_values=['NaN'],
    parse_dates=True,
    index_col='DATE',
)

# Look at the last few rows to check that the download worked
bzn_t_raw_df.tail()

# Save the raw download locally for later use
# NOTE(review): the output name has no '.csv' extension - confirm that is intended
bzn_t_raw_df.to_csv('bozeman_tobs_1892-2025')

# Check that the data were imported to a DataFrame
type(bzn_t_raw_df)

# Plot the data as a histogram to check that NaNs were imported correctly

bzn_t_raw_df.plot.hist()

# Plot the raw daily record as a line plot

bzn_t_raw_df.plot(
    y='TOBS',
    title='Daily Temperature in Bozeman, MT',
    xlabel='Year',
    # Raw string: '\c' is not a valid Python escape sequence, so a plain
    # literal raises an invalid-escape warning. The label text itself
    # (matplotlib mathtext for the degree sign) is unchanged.
    ylabel=r'Temperature ($^\circ$F)',
    legend=False,
)

# Clean up the DataFrame items and labels

# Keep only the TOBS column (the station ID is not needed), restrict the
# record to 1933-2024 to leave out years with many NaNs, drop the remaining
# missing observations, rename 'TOBS' to temp_f, and add a Celsius column.

bzn_t_int_df = bzn_t_raw_df[['TOBS']]

# Calendar year of every observation, derived once from the raw index
obs_years = pd.to_datetime(bzn_t_raw_df.index).year
in_study_period = (obs_years >= 1933) & (obs_years <= 2024)

bzn_t_df = (
    bzn_t_int_df[in_study_period]
    .dropna()
    .rename(columns={'TOBS': 'temp_f'})
)

# Fahrenheit to Celsius
bzn_t_df['temp_c'] = (bzn_t_df['temp_f'] - 32) * (5 / 9)
bzn_t_df

# Plot the cleaned daily record as a line plot

bzn_t_df.plot(
    y='temp_f',
    title='Daily Temperature in Bozeman, MT',
    xlabel='Year',
    # Raw string: '\c' is not a valid Python escape sequence, so a plain
    # literal raises an invalid-escape warning. The label text is unchanged.
    ylabel=r'Temperature ($^\circ$F)',
    legend=False,
)

# Resample the daily data to annual means ('YE' is the year-end frequency)

bzn_t_ann_df = bzn_t_df.resample('YE').mean()
bzn_t_ann_df

# Plot the annual means to make sure this looks better than the daily series

bzn_t_ann_plot = bzn_t_ann_df.plot(
    y='temp_f',
    # The series plotted here is the annual mean, not the daily record
    title='Annual Mean Temperature in Bozeman, MT',
    xlabel='Year',
    # Raw string: '\c' is not a valid Python escape sequence, so a plain
    # literal raises an invalid-escape warning. The label text is unchanged.
    ylabel=r'Temperature ($^\circ$ F)',
    legend=False,
)

# Write the figure to disk before showing it
plt.savefig('bzn_t_ann_plot.png')
plt.show()

# Make an interactive plot of the annual mean temperature

bzn_t_ann_plot_int = bzn_t_ann_df.hvplot(
    y='temp_f',
    # The series plotted here is the annual mean, not the daily record
    title='Annual Mean Temperature in Bozeman, MT',
    xlabel='Year',
    # A literal degree sign is used instead of matplotlib mathtext
    # ('$^\circ$'): single-dollar mathtext is matplotlib-specific and would
    # be shown verbatim by hvplot's default Bokeh backend.
    # NOTE(review): assumes the default Bokeh backend is in use - confirm.
    ylabel='Temperature (°F)',
    legend=False,
)

bzn_t_ann_plot_int

# Save the interactive plot as a standalone HTML file

hv.save(bzn_t_ann_plot_int, 'bzn_t_ann_plot_int.html')

# Fit an OLS Linear Regression to the data (ChatGPT query)
# Comments that came from ChatGPT are marked with double hashtags (##)

import numpy as np
from sklearn.linear_model import LinearRegression

# Move the DATE index into an ordinary column, make sure it is a datetime,
# and pull the calendar year out of it. The way 'DATE' is formatted causes
# errors with regplot(), so the plain year is used as the predictor instead.
# The variable name is abbreviated to bzn_t_reind_df so it doesn't get unwieldy.
bzn_t_reind_df = bzn_t_ann_df.reset_index().assign(
    DATE=lambda frame: pd.to_datetime(frame['DATE']),
    year=lambda frame: frame['DATE'].dt.year,
)

# Clean the data of NaNs before calculating the fit (Google Gemini)
bzn_t_lin_reg_df = bzn_t_reind_df.dropna()

## scikit-learn needs the predictor as a 2D array, hence the double brackets
x = bzn_t_lin_reg_df[['year']].to_numpy()
y_f = bzn_t_lin_reg_df['temp_f'].to_numpy()

## Create and fit the linear regression model (fit() returns the estimator)
model = LinearRegression().fit(x, y_f)

## Get the slope and intercept
slope, intercept = model.coef_[0], model.intercept_

## Print the results
print(f"Slope: {slope}")
print(f"Intercept: {intercept}")

# Plot annual average temperature with an OLS trend line.
# The columns are named as strings and resolved against `data`, which is the
# form regplot documents for DataFrame input, instead of passing
# single-column DataFrames for x and y.
bzn_plot_linreg = sns.regplot(
    x='year',
    y='temp_f',
    data=bzn_t_lin_reg_df,
    ci=None,  # draw the fitted line only, without a confidence band
)

# Set plot labels.
# Raw strings: '\c' is not a valid Python escape sequence, so plain literals
# raise an invalid-escape warning. The label text is unchanged.
bzn_plot_linreg.set(
    title=r'OLS Regression of Annual Mean Temperature ($^\circ$F) in Bozeman, MT',
    xlabel='Time',
    ylabel=r'Temperature ($^\circ$F)',
)

# Save the figure and display the plot without extra text
plt.savefig('bzn_plot_linreg.png')
plt.show()

# Climate Change in Bozeman, Montana

## Import the Data and Organize into a DataFrame

## Plot the data to start looking at them

## First Glance Interpretation

## Data Cleaning

## Plotting the Data

## Resample the data to an annual mean and plot interactively

## Interpretation

## Fit a linear regression to the data

## Plot the linear regression