plt.figure(figsize = (12, 4))
plt.bar(data.index, data['nutri'], color = 'darkblue', label = 'nutri')
plt.bar(data.index, data['mineral'], color = 'brown', label = 'mineral')
plt.title('Comparison of Nutrient and Mineral Content')
plt.xlabel('Nutrient|Mineral')
plt.ylabel('Amount')
plt.legend()
plt.show()
class Designer(Employee):
    """Employee whose promotion rule also takes international awards into account."""

    def __init__(self, name, seniority, awards=2):
        super().__init__(name, seniority)
        # Number of international awards; each one counts double in the promotion rule.
        self.intlawards = awards

    def check_if_it_is_time_for_upgrade(self):
        """Add one seniority point, promote if the rule fires, return ``publish_grade()``."""
        # For now we assume that all designers are accredited, so every call adds a point.
        self.seniority += 1
        # The whole sum must be reduced modulo 7: `%` binds tighter than `+`,
        # so without the outer parentheses only the awards term was reduced.
        if (self.seniority + self.intlawards * 2) % 7 == 0:
            self.grade_up()
        # Publication of the results.
        return self.publish_grade()
def find_duplicates(lst, replacement="9090"):
    """Collect distinct first fields of ``lst`` and overwrite repeated ones in place.

    Args:
        lst: Sequence of mutable records (lists) whose element 0 is an identifier.
        replacement: Value written into element 0 of every record whose
            identifier was already seen. Defaults to the previously hard-coded "9090".

    Returns:
        List of identifiers in order of first appearance.
    """
    unique_ids = []
    for record in lst:
        if record[0] in unique_ids:
            record[0] = replacement
        else:
            unique_ids.append(record[0])
    return unique_ids
find_duplicates(lst)
df_new['incident_type'].value_counts()[df_new['incident_type'].value_counts() >= 5].sort_values(ascending = False)
def checkio(str):
    """Return True when ``str`` contains three purely alphabetic words in a row."""
    streak = 0
    for word in str.split():
        if word.isalpha():
            streak += 1
        else:
            # Any non-alphabetic word breaks the run.
            streak = 0
        if streak >= 3:
            return True
    return False
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv('data.csv')
df.sort_values('perc_of_5star', inplace=True)
df.reset_index(inplace=True)
plt.hlines(y=df.decade, xmin=0, xmax=df.perc_of_5star, color='skyblue')
plt.gca().invert_yaxis()
plt.show()
import numpy as np
import pandas as pd
df=pd.DataFrame(data={'rate_group':['A%','B%', 'C']})
df['rate_group'] = df['rate_group'].str.replace('%', '')
df['rate_group']=df['rate_group'].replace(r'^\s*$', np.nan, regex=True)
print(df['rate_group'])
def validate_usr(username):
    """Return True if ``username`` is 4-16 characters of lowercase letters, digits or ``_``."""
    import re

    if not 4 <= len(username) <= 16:
        return False
    # fullmatch instead of "^...$": `$` also matches just before a trailing
    # newline, which let names such as "abcd\n" through.
    return re.fullmatch(r"[a-z0-9_]*", username) is not None
import pandas as pd
sl = [0.05, 0.4, 0.5, 0.95]
sw = [0.7, 0.8, 0.3, 0.9]
data = {"sl": sl, "sw": sw}
def name_list(group, dct):
    """Print a numbered, alphabetically sorted list of full names for one group.

    A record belongs to ``group`` when its last element equals ``group``;
    the first three elements of the record form the printed name.
    """
    names = sorted(
        ' '.join((record[0], record[1], record[2]))
        for record in dct.values()
        if record[-1] == group
    )
    for position, full_name in enumerate(names, start=1):
        print('{}. {}'.format(position, full_name))
name_list('BST161', dct)
def replace(students, stnums):
    """Overwrite the id (element 0) of every student whose id is listed in ``stnums``.

    The records are modified in place and the same ``students`` object is returned.
    NOTE(review): ``replacing_num`` is not a parameter; it is read from the
    enclosing/module scope and must be defined before this function is called.
    """
    listed = (record for record in students if record[0] in stnums)
    for record in listed:
        record[0] = replacing_num
    return students
from math import sqrt
def conf_interval(n, mean, sig, conf):
    """Return the half-width (margin of error) of a normal confidence interval.

    The previous implementation used ``abs(conf - mean)`` as the z value and
    ignored ``sig``; the z value is now the standard normal quantile for the
    requested two-sided confidence level.

    Args:
        n: Sample size (must be positive).
        mean: Sample mean. It does not affect the width and is kept only for
            interface compatibility.
        sig: Standard deviation.
        conf: Two-sided confidence level in ``(0, 1]``.

    Returns:
        ``z * sig / sqrt(n)``; ``inf`` when ``conf`` is exactly 1.

    Raises:
        ValueError: If ``conf`` is outside ``(0, 1]``.
    """
    from statistics import NormalDist

    if not 0 < conf <= 1:
        raise ValueError("conf must be in the interval (0, 1]")
    if conf == 1:
        # A 100% interval of a normal distribution is unbounded.
        return float("inf")
    z_value = NormalDist().inv_cdf((1 + conf) / 2)
    return z_value * sig / sqrt(n)
conf_interval(3, 1, 2, 1)
# Solution 1
df.groupby(['userId'])['rating'].sum().sort_values(ascending = False).head()
# Solution 2
df.groupby(['userId'])['rating'].sum().sort_values(ascending = False).head().reset_index()
def goes_after(word, first, second):
    """Return True if ``second`` appears immediately after ``first`` somewhere in ``word``."""
    adjacent_pairs = zip(word, word[1:])
    return any(left == first and right == second for left, right in adjacent_pairs)
import numpy as np
def system_solver(a, b, c=None, d=None, e=None, f=None):
    """Solve the linear system ``a @ x = b`` and return ``x``.

    Args:
        a: Square coefficient matrix.
        b: Right-hand side vector (or matrix).
        c, d, e, f: Unused. They are kept, now optional, so that both the
            historical six-argument form and the two-argument call work.
    """
    return np.linalg.solve(a, b)
a = np.array([[4, 2, 1], [1, 3, 0], [0, 5, 4]])
b = np.array([4, 12, -3])
system_solver(a, b)
def fill_na(df, col_name='rectal_temp', range_thresh=0.2):
col_vals = df[col_name]
col_vals_notnull_ind = col_vals.notnull()
col_vals_notnull_ind = col_vals_notnull_ind.values[:, np.newaxis]
col_vals_notnull = col_vals.loc[col_vals_notnull_ind].values
col_vals_notnull_idx = col_vals_notnull_ind.nonzero()[0]
for i in range(0, col_vals.shape[0]):
if not col_vals_notnull_ind[i]:
for j in range(0, col_vals_notnull.shape[0]):
if (col_vals_notnull[j] - col_vals_notnull[j] * range_thresh) < col_vals[i] < (col_vals_notnull[j] + col_vals_notnull[j] * range_
def convert_to_dict(students):
    """Map each record's first element to the rest of the record.

    When the same first element occurs more than once, the last record wins.
    """
    return {record[0]: record[1:] for record in students}
convert_to_dict(students)
def group_by_rating(df):
    """Return ``{decade: share of rows with rating == 5.0}`` for every decade in ``df``."""
    return {
        decade: (rows.rating == 5.0).sum() / len(rows)
        for decade, rows in df.groupby('decade')
    }
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp as multi
%matplotlib inline
data = pd.read_csv('/Users/josephfrasca/Coding/Python/Python-Practice/Stats/ToothGrowth.csv')
data.head()
data.head()
def max_value(dct):
    """Return the key whose entry has the largest ``"Value"`` field.

    The previous loop started from 0 and left the result unbound when every
    value was <= 0 or the mapping was empty. Ties go to the first key in
    iteration order; an empty mapping returns ``None``.
    """
    if not dct:
        return None
    return max(dct, key=lambda name: dct[name]["Value"])
max_value(dct)
def find_non_numbers(data, column):
    """Return the rows of ``data`` whose ``column`` value cannot be parsed as a number.

    Values that ``pd.to_numeric`` coerces to NaN (including missing values) are selected.
    """
    as_numbers = pd.to_numeric(data[column], errors='coerce')
    return data[as_numbers.isna()]
#OR
import re
def find_non_numbers(data, column):
    """Return the rows of ``data`` whose ``column`` value contains an ASCII letter.

    The previous version negated the mask, so it returned the rows *without*
    letters, and it raised on non-string cells. Cells are converted to text
    first, so a missing value (rendered as "nan"/"None") also counts as
    non-numeric.
    """
    has_letters = data[column].astype(str).str.contains(r'[a-zA-Z]', regex=True)
    return data[has_letters]
def replace_sec(stnums, students, replacing_num):
    """Keep the first record of each listed id and re-number every later duplicate.

    For every id in ``stnums`` the first student carrying it is left alone;
    each further student with the same id gets ``replacing_num`` written into
    element 0. Records are modified in place and ``students`` is returned.
    """
    for wanted in stnums:
        first_seen = False
        for record in students:
            if record[0] != wanted:
                continue
            if first_seen:
                record[0] = replacing_num
            first_seen = True
    return students
replace_sec(stnums, students, replacing_num)
#function that: There is a Pandas dataframe
import pandas as pd
def merge_df(df):
df.drop_duplicates(subset = ["name"],
keep = 'first', inplace = True)
return df.sort_values(by = 'name')
df = pd.read_csv('test_data.csv')
merge_df(df)
def hotpo(n):
    """Return the number of Collatz ("half or triple plus one") steps needed to reach 1."""
    steps = 0
    while n > 1:
        # Floor division keeps n an exact int; `/=` produced floats that lose
        # precision for large values.
        n = n // 2 if n % 2 == 0 else 3 * n + 1
        steps += 1
    return steps
def plot_boxplots(p1, p2):
    """Show box plots of the two samples side by side on one axes."""
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # One call with both samples places them at positions 1 and 2; two separate
    # calls drew both boxes on top of each other at position 1.
    ax1.boxplot([p1, p2])
    plt.show()
plot_boxplots(p.speed, boosted_p.speed)
def add_777999_555333_11110_77_7_00_0(a):
    """Return the payout for the number ``a``.

    Exact jackpots: 777 -> 200, 999 -> 100, 555 -> 50, 333 -> 15, 111 -> 10.
    Otherwise the trailing digits decide: "77" -> 5, "7" -> 3, "00" -> 2,
    "0" -> 1, anything else -> 0.

    The "00" and "0" rules previously inspected the tens and hundreds digits,
    so e.g. 5 paid 2 while 100 paid only 1; they now test the trailing digits
    like the "77"/"7" rules do.
    """
    jackpots = {777: 200, 999: 100, 555: 50, 333: 15, 111: 10}
    if a in jackpots:
        return jackpots[a]
    if a % 100 == 77:
        return 5
    if a % 10 == 7:
        return 3
    if a % 100 == 0:
        return 2
    if a % 10 == 0:
        return 1
    return 0
def find_score(num_trials):
    """Play ``num_trials`` random draws and return ``(total payout, winning draws)``.

    Each draw is ``np.random.randint(1, 1000)`` scored by
    ``add_777999_555333_11110_77_7_00_0``. A draw counts as a success only
    when it pays out; previously every draw was counted, so ``successes``
    always equalled ``num_trials``.
    """
    money = 0
    successes = 0
    for _ in range(num_trials):
        payout = add_777999_555333_11110_77_7_00_0(np.random.randint(1, 1000))
        money += payout
        if payout > 0:
            successes += 1
    return money, successes
def sale_hotdogs(n):
    """Return the total price of ``n`` hot dogs: 100 each below 5, 95 below 10, else 90."""
    if n < 5:
        return n * 100
    if n < 10:
        return n * 95
    return n * 90
def add(a, b):
return a + b
add(1, 2)
def graph_difference(data, title):
difference = data['nutri'] - data['mineral']
percentage = (difference / data['nutri']) * 100
plt.figure(figsize = (12, 4))
plt.bar(data.index, data['nutri'], color = 'darkblue', label = 'nutri')
plt.bar(data.index, data['mineral'], color = 'brown', label = 'mineral')
plt.title(title, fontsize = 20)
plt.grid(axis = 'y')
for i in range(len(data)):
plt.text(x = data.index[i] - 0.3, y = data['nutri'][i] + 0.1, s = str(round(percentage[i], 1)) + '%', size = 12)
plt.show()
def guess_blue(blue_start, red_start, blue_pulled, red_pulled):
    """Return the probability that the next marble drawn is blue.

    Raises ZeroDivisionError when no marbles are left.
    """
    blue_left = blue_start - blue_pulled
    marbles_left = blue_left + red_start - red_pulled
    return blue_left / marbles_left
from sklearn import datasets
# train_test_split was used below without being imported.
from sklearn.model_selection import train_test_split
import numpy as np

# Load the svmlight-format file: X is a sparse matrix, y the label vector.
X, y = datasets.load_svmlight_file('data.txt')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Densify the sparse feature matrices and make the labels float.
X_train = X_train.toarray()
X_test = X_test.toarray()
y_train = y_train.astype('float')
y_test = y_test.astype('float')
def get_needed_posts(query):
site = pd.DataFrame(columns=['date', 'title', 'link'])
for q in query:
URL = parseurl+'search/'
params = {
'q': q
}
req = requests.get(URL, params=params)
soup = BeautifulSoup(req.text)
articles = soup.find_all('article', class_='tm-articles')
for article in articles:
try:
title = article.find('h2', class_='tm-article').text
date = article.find('span', class_='tm-article').text.strip()
link = article.find('h2', class_='tm-article').find('a').get('href')
except:
pass
if link not in site.link.values:
row = {'date': date, 'title': title, 'link': 'https://habr.com'+link}
site = pd.concat([site, pd
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Keep only rows with a known, positive GDP per capita, ordered ascending.
gdpdiff = gdpdiff.dropna(subset=['GDP per capita'])
gdpdiff = gdpdiff[gdpdiff['GDP per capita'] > 0]
gdpdiff = gdpdiff.sort_values(by='GDP per capita')
# Series.replace returns a new Series; assign it back so the shortened country
# names are actually used in the plot (the result used to be discarded).
gdpdiff['Country or region'] = gdpdiff['Country or region'].replace({
    'United States': 'USA',
    'United Kingdom': 'UK'
})
gdpdiff.plot(x='Country or region', y='GDP per capita', kind='bar')
import plotly.express as px
fig = px.bar(y = label, x = counts)
fig.show()
import pandas as pd
ratings = pd.read_csv('ratings.csv')
print(ratings.head())
# Create a function which return the average lifetime of users
def lifetime(group):
return group.max() - group.min()
lifetime_users = ratings.groupby('userId').agg(lifetime)
print(lifetime_users)
average_lifetime = lifetime_users['timestamp'].mean()
print(average_lifetime)
# output:
# userId movieId rating timestamp
# 0 1 31 2.5 1260759144
# 1 1 1029 3.0 1260759179
# 2 1 1061 3.0 1260759182
# 3 1 1129 2.0 1260759185
# 4 1 1172 4.0 1260759205
# timestamp
# userId
# 1 203560
# 2 866607
# 3 8
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv(filename)
df.plot(kind='bar',x='performer',y='number_of_hits')
plt.show()
import pandas as pd
df = pd.read_csv('artist_song_chart_debut.csv')
df['chart_debut'] = df['chart_debut'].apply(lambda x: str(x)[:4])
df
# Years of the rows whose average temperature exceeds 12
# (a stray trailing quote made this line a syntax error).
df_hot_years = df_ru[df_ru["av_temp"] > 12]["year"]
print(df_hot_years)
def sum_of_differences(arr):
    """Return the sum of differences between consecutive elements in descending order.

    The consecutive differences of a sorted sequence telescope to
    ``max(arr) - min(arr)``, so no sorting is needed and the caller's list is
    no longer reordered as a side effect. Sequences with fewer than two
    elements give 0.
    """
    if len(arr) <= 1:
        return 0
    return max(arr) - min(arr)
#Fix it!
misclassified = []
for index, label_predicted in enumerate(predicted):
    # `index` is a position in `predicted`, so the true label must be looked up
    # positionally too; `y_test[index]` was a label/column lookup.
    # NOTE(review): assumes y_test is a DataFrame with a 'Category' column, as
    # the record built below already implied - confirm.
    actual = y_test.iloc[index]['Category']
    if label_predicted != actual:
        misclassified.append({'message': X_test.iloc[index]['Message'], 'actual': actual, 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
def check_sample_size(conf = 0.95, delta = 0.05, sigsqr = 225):
    """Return the minimum sample size for estimating a mean.

    Args:
        conf: Confidence level.
        delta: Margin of error.
        sigsqr: Variance.

    Returns:
        The smallest integer ``n`` with ``n >= z**2 * sigsqr / delta**2``,
        where ``z`` is the two-sided standard normal quantile for ``conf``.
        The value is rounded up: truncating with ``int()`` returned a sample
        one too small to reach the requested precision.
    """
    import math
    from statistics import NormalDist

    p = 1 - ((1 - conf) / 2)
    # Standard-library quantile; this block previously relied on a `norm`
    # object that it did not import.
    z = NormalDist().inv_cdf(p)
    n = (z ** 2) * sigsqr / delta ** 2
    return math.ceil(n)
print(check_sample_size())
import re
def rate_group(int_rate):
    """Classify an interest-rate string such as ``"12.5%"`` into a bucket.

    Returns '>15' above 15, '10-15' for 10 to 15 inclusive, '<10' otherwise.
    Exactly 10.0 used to fall into '<10', contradicting both the label and the
    other ``rate_group`` definition in this file.
    """
    rate = float(int_rate.replace('%', ''))
    if rate > 15.0:
        return '>15'
    if rate >= 10.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def interval(n, mean, sig, conf):
    """Return the rounded width of the normal confidence interval for a mean.

    Args:
        n: Sample size.
        mean: Sample mean (does not influence the width).
        sig: Standard deviation.
        conf: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        ``round(2 * z * sig / sqrt(n))``.
    """
    from statistics import NormalDist

    # `z` was never defined in this function; derive it from `conf`.
    z = NormalDist().inv_cdf((1 + conf) / 2)
    h = sig / (n ** (1 / 2)) * z
    return round(h * 2)
fig, ax = plt.subplots()
df.plot(kind='scatter',
x='User_Score',
y='Global_Sales',
ax=ax)
df.plot(kind='scatter',
x='Critic_Score',
y='Global_Sales',
ax=ax)
plt.ylim(0, 40)
df.groupby(level=0).cumcount()
def function(x, y, z=None):
    """Return ``x + y``.

    ``z`` is unused; it is optional so that the two-argument call in this
    file works while existing three-argument calls stay valid.
    """
    return x + y
function(1, 2)
# def my_mannwhitneyu(data, col, by):
def my_mannwhitneyu(data, col1, col2):
# select data for the two conditions
# data1 = data[data[by] == 'gate_30']
# data2 = data[data[by] == 'gate_40']
data1 = data[data[col1] == col1]
data2 = data[data[col2] == col2]
# calculate Mann-Whitney U-statistics and p-value
stat, p = mannwhitneyu(data1, data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))
return stat, p
my_mannwhitneyu(data, 'gate_30', 'gate_40')
def glue_list(lst):
    """Concatenate the string forms of all elements of ``lst`` without a separator."""
    return ''.join(map(str, lst))
print(glue_list([0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1]))
def money_add(lst):
    """Return how many elements of ``lst`` are at least 100.

    The counter used to be incremented without ever being initialised
    (UnboundLocalError) and was never returned.
    """
    return sum(1 for amount in lst if amount >= 100)
def date_range(start_date, end_date):
    """Return every date from ``start_date`` to ``end_date`` inclusive as 'YYYY-MM-DD' strings.

    Returns an empty list when the start is after the end or when either
    argument is not a valid '%Y-%m-%d' date.
    """
    # Imported locally: the bare `except` used to hide the NameError raised
    # when the `dt`/`td` aliases were not yet defined, so the function
    # silently returned [] for valid input.
    from datetime import datetime, timedelta

    try:
        start = datetime.strptime(start_date, '%Y-%m-%d')
        end = datetime.strptime(end_date, '%Y-%m-%d')
    except ValueError:
        return []
    # Compare parsed dates, not raw strings: '2022-1-5' sorts after
    # '2022-01-07' lexicographically although it is the earlier day.
    span = (end - start).days
    return [
        (start + timedelta(days=offset)).strftime('%Y-%m-%d')
        for offset in range(span + 1)
    ]
date_range('2022-12-01', '2022-12-05')
def group_list(dct, gr):
    """Return the sorted full names (first three fields) of records that contain ``gr``."""
    return sorted(' '.join(info[0:3]) for info in dct.values() if gr in info)
group_list(dct, 'BST161')
def bar_plot(question_text, title, y_title, name):
question = df[question_text].value_counts()
label = question.index
counts = question.values
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text=title)
fig.update_yaxes(title_text=y_title)
fig.update_xaxes(title_text=name)
fig.show()
question_text = 'What is the most preferred working environment for you.'
title = 'Какая рабочая среда для вас наиболее предпочтительна?'
y_title = 'Ответы'
name = 'Количество ответов'
bar_plot(question_
def year_leaders(df):
    """Return, for every ``chart_debut`` year, the row with the most hits.

    Years appear in order of first occurrence. The columns 'time_on_chart',
    'consecutive_weeks', 'decade' and 'num_of_hits' are removed from the
    result. The input frame is not modified.

    Fixes: 'num_of_hits' used to be dropped *before* it was needed to pick
    the leader, and rows were collected with ``DataFrame.append``, which was
    removed in pandas 2.0.
    """
    leader_labels = df.groupby('chart_debut', sort=False)['num_of_hits'].idxmax()
    leaders = df.loc[leader_labels]
    leaders = leaders.drop(columns=['time_on_chart', 'consecutive_weeks', 'decade', 'num_of_hits'])
    return leaders.reset_index(drop=True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
for i in winnums:
    print(i)
    # `i += 1` is a statement and cannot appear inside print(...); print the
    # incremented value instead.
    print(i + 1)
paid['App'].groupby(paid['Type']).sum()\
.plot(kind='pie',
figsize=(5, 6),
autopct='%1.1f%%',
startangle=90,
title='Процент бесплатных и платных приложений',
legend=True)
def delete_rock_rows(df):
...
def year_leaders(dfp):
    """Return the top row (most hits) per debut year, indexed by ``chart_debut``.

    Rows are ordered by year, newest first. The input frame is left untouched.
    """
    ranked = dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    # After the descending sort the first row of each year is its leader.
    leaders = ranked.drop_duplicates(subset='chart_debut', keep='first')
    return leaders.set_index('chart_debut')
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
df_scaled = std_scaler.fit_transform(df)
print(df_scaled)
def total_ingredients(cook_book, dish='salad', portions=5):
    """Print and return each ingredient quantity of ``dish`` scaled to ``portions``.

    Args:
        cook_book: Mapping ``dish name -> list of ingredient dicts`` with a
            'quantity' entry each.
        dish: Dish to scale; defaults to the previously hard-coded 'salad'.
        portions: Number of portions; defaults to the previously hard-coded 5.

    Returns:
        List of scaled quantities in recipe order (empty for an unknown dish).

    The previous body indexed the cook book with 'quantity', compared an
    undefined name ``key`` and multiplied the *string* 'quantity'.
    """
    totals = []
    for ingredient in cook_book.get(dish, []):
        grams = ingredient['quantity'] * portions
        print(grams)
        totals.append(grams)
    return totals
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
{'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
total_ingredients(cook_book)
from datetime import datetime as dt
from datetime import timedelta as td
def date_range(start_date, end_date):
    """Return every date from ``start_date`` to ``end_date`` inclusive as 'YYYY-MM-DD' strings.

    Returns ``[]`` when the start is after the end and the string
    'Incorrect date format' when either argument is not a valid '%Y-%m-%d' date.
    """
    try:
        start = dt.strptime(start_date, '%Y-%m-%d')
        end = dt.strptime(end_date, '%Y-%m-%d')
    except ValueError:
        # Only parsing problems are reported; the bare `except` also hid
        # unrelated programming errors.
        return 'Incorrect date format'
    # Compare parsed dates rather than raw strings, which order
    # non-zero-padded input such as '2022-1-5' incorrectly.
    if start > end:
        return []
    days = (end - start).days
    return [(start + td(days=offset)).strftime('%Y-%m-%d') for offset in range(days + 1)]
date_range('2022-12-01', '2022-12-20')
def rate_group(value):
    """Bucket a numeric rate: '>15' above 15, '10-15' from 10 to 15 inclusive, else '<10'."""
    if value > 15.00:
        return '>15'
    # Reaching this point means value is not above 15, so only the lower
    # bound still has to be checked.
    if value >= 10.00:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
df
def max_dct(dct, key):
    """Return the outer key of ``dct`` whose inner mapping has the largest ``key`` field.

    The ``key`` argument used to be shadowed by the lambda parameter, so the
    field name 'Value' was always used regardless of what the caller passed.
    """
    return max(dct, key=lambda name: dct[name][key])
dct = {
'value1': {'Value': 3},
'value2': {'Value': 1},
'value3': {'Value': 2},
}
max_dct(dct, 'Value') # returns 'value1'
import math
def sample_size(delta, sigsqr, conf):
    """Return ``ceil(sigsqr * (ln(1 / (1 - conf)) / delta) ** 2 / 2)``.

    NOTE(review): this is not the usual normal-quantile sample-size formula;
    confirm that the logarithmic bound is what is intended.
    """
    log_term = math.log(1 / (1 - conf)) / delta
    scaled = sigsqr * log_term ** 2
    return math.ceil(scaled / 2)
sample_size(0.05, 0.02, 0.95)
def year_leaders(df):
    """Return the row with the most hits for every ``chart_debut`` year.

    Rows are ordered by year ascending with a fresh 0..n-1 index; all
    original columns are kept.

    ``groupby(...).apply(nlargest)`` followed by ``reset_index()`` depends on
    the pandas version: where the grouping column is kept in the applied
    frame, ``reset_index`` fails because 'chart_debut' already exists.
    Selecting the ``idxmax`` rows avoids the problem.
    """
    leader_labels = df.groupby('chart_debut')['num_of_hits'].idxmax()
    return df.loc[leader_labels].reset_index(drop=True)
from sklearn.preprocessing import StandardScaler
def standardize(df):
    """Return ``df`` with every column scaled to zero mean and unit variance.

    ``StandardScaler.fit_transform`` expects a 2-D input, so it is applied to
    the whole frame at once; ``df.apply`` handed it one 1-D column at a time.
    The index and column labels of ``df`` are preserved.
    """
    scaled = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)
standardize(df)
def years():
return list(range(1950, 2011))
years()
def multiply_dict(key):
grams = 0
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'gr'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pct'},
{'ingridient_name': 'pepper', 'quantity': 20, 'measure': 'гр'}]}
if key == 'salad':
grams = cook_book.get('salad')[0].get('quantity') * 5
print(grams)
multiply_dict('salad')
def replace_stnum(stnums, students, replacing_num):
    """Write ``replacing_num`` into element 0 of every record containing a listed number.

    A record matches when any of its fields equals one of ``stnums``.
    Records are modified in place and ``students`` is returned.
    """
    for record in students:
        if any(number in record for number in stnums):
            record[0] = replacing_num
    return students
print(replace_stnum(stnums, students, replacing_num))
plt.plot(data.index, data['nutri'] - data['mineral'], color = 'red', linewidth = 2, label = 'difference')
plt.xticks(data.index, data['year'], rotation = 90)
plt.xlabel('Year')
plt.ylabel('Profit')
plt.legend()
plt.show()
import pandas as pd
# create a new dataframe
# Work on an explicit copy: assigning into a column slice of `df` triggers
# SettingWithCopyWarning and may not update `dfs` as intended.
dfs = df[['song','performer','chart_debut','peak_position','worst_position','time_on_chart','consecutive_weeks']].copy()
# convert the chart_debut to string
dfs['chart_debut'] = dfs['chart_debut'].astype(str)
# keep only the part before the first "-"
date = dfs.chart_debut.str.split("-", n = 1, expand = True)
# drop the original chart_debut column
dfs.drop(columns =['chart_debut'], inplace = True)
# re-insert the shortened value at the original column position
dfs.insert(2,'chart_debut',date[0])
# show the top 5 rows
print(dfs.head())
def df_to_float(df):
    """Return ``df`` converted to float, or ``None`` when the conversion fails.

    Only conversion errors are treated as "cannot convert"; the bare
    ``except`` also swallowed unrelated errors such as KeyboardInterrupt.
    """
    try:
        return df.astype(float)
    except (ValueError, TypeError):
        return None
def cook_book():
portions = 5
for key, value in cook_book.items():
for sub_dict in value:
for v in sub_dict.values():
ingridient_name = v[0]
quantity = v[1]
measure = v[2]
grams = quantity * portions
print(grams)
return grams
def line_graph(dataframe):
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.plot(dataframe.columns, dataframe.iloc[0])
ax.set_xlabel('years')
ax.set_ylabel('columns')
ax.set_title('line graph')
df['profit_perc'] = df['profit'] / df['revenue']
def change_shelf(data, docnum, shelf):
    """Move document ``docnum`` to ``shelf``.

    Args:
        data: Mapping ``shelf id -> list of document numbers``; modified in place.
        docnum: Document number to move.
        shelf: Target shelf id.

    Returns:
        ``data`` on success (after printing 'OK'), 'ERROR NO SUCH KEY' when the
        target shelf does not exist, 'ERROR NO SUCH VALUE' when the document
        is on no shelf.
    """
    if shelf not in data:
        return'ERROR NO SUCH KEY'
    for docs in data.values():
        if docnum in docs:
            # Take the document off its current shelf and put it on the target
            # one; it used to be appended again to the shelf it was found on.
            docs.remove(docnum)
            data[shelf].append(docnum)
            print('OK')
            return data
    return 'ERROR NO SUCH VALUE'
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
def compare_gdp_and_happiness(df1, df2):
df_merge = df1.merge(df2, how='inner', left_on='Country or region', right_on='entity')
df_merge = df_merge[['Country or region', 'GDP per capita', 'happinessScore']]
df_merge['GDP per capita'] = df_merge['GDP per capita'].apply(pd.to_numeric)
df_merge['happinessScore'] = df_merge['happinessScore'].apply(pd.to_numeric)
df_merge_sort = df_merge.sort_values(by='happinessScore', ascending=False)
#df_merge_sort
df_merge_top_1 = df_merge_sort.head(20)[0:1]
df_merge_top_1.rename(columns={'Country or region': 'Top1', 'GDP per capita': 'Top1 GDP', 'happinessScore
def guessBlue(blue_start, red_start, blue_pulled, red_pulled):
    """Return the share of blue marbles among the marbles still in the bag."""
    remaining_blue = blue_start - blue_pulled
    remaining_total = remaining_blue + red_start - red_pulled
    return remaining_blue / remaining_total
def get_year(date):
return int(date[:4])
def get_month(date):
return int(date[5:7])
def get_day(date):
return int(date[8:])
df['year'] = df['date'].apply(get_year)
df['month'] = df['date'].apply(get_month)
df['day'] = df['date'].apply(get_day)
fvalue, pvalue = stats.f_oneway(p.Speed, p2.Speed)
df[df['peak_position'] == 1].drop_duplicates(subset = 'song', keep = 'first')
def try_to_int(df):
    """Convert every column of ``df`` that can hold integers to nullable 'Int64'.

    Columns that cannot be converted are left unchanged. ``df`` is modified
    in place and also returned for convenience.
    """
    for col in df.columns:
        try:
            df[col] = df[col].astype('Int64')
        except (ValueError, TypeError):
            # Non-integer column: best effort, keep it as it is. Only
            # conversion errors are skipped, not every exception.
            continue
    return df
import pandas as pd
def split_data(data):
# data is a list of tuples
# return a tuple of four lists
# X_train, X_test, y_train, y_test
df = pd.DataFrame(data, columns=['X', 'y'])
train, test= train_test_split(df, test_size=0.3, random_state=42)
return (train['X'].tolist(), test['X'].tolist(), train['y'].tolist(), test['y'].tolist())
split_data([(1, 2), (3, 4), (5, 6)])
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame( {'place': [1, 2, 3, 4, 5],
'decade': ['2010-2020', '1900-1910', '1970-1980', '2000-2010', '1960-1970'],
'perc_of_5star': [2.3, 1.379, 1.179, 1.176, 1.133]})
df.sort_values(by='perc_of_5star', ascending=True).plot(kind='barh', x='decade', y='perc_of_5star', legend=False)
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf,
cats,
test_size=0.3,
random_state=42)
import pandas as pd
import numpy as np
df = pd.DataFrame({'name': ['Taylor Swift', 'Imagine Dragons', 'Ed Sheeran'],
                   'year': [2008, 2012, 2014]})


def decade_equals(year):
    """Return the first year of the decade that ``year`` belongs to."""
    return year // 10 * 10


# Group by the decade of the 'year' column. Passing the function itself to
# groupby applies it to the index labels (0, 1, 2), not to the years.
df.groupby(df['year'].map(decade_equals))['name'].apply(list)
stnums = ['4004']
students = [
['0001', 'Antonov', 'Anton', 'Igorevich', '20.08.2009', 'BST161'],
["1102", "Bogov", "Artem", "Igorevich", "25.01.2010", "BST162"]
["0333", "Glagoleva", "Anastasiya", "Nikolaevna", "11.07.2009", "BST163"]
["4004", "Stepanova", "Natalia", "Aleksandrovna", "13.02.2008", "BST161"]
["0045", "Bokov", "Igor", "Kharitonovich", "02.06.2009", "BST161"],
["0096", "Vasil'kov", "Valentin", "Sergeevich", "20.03.2009", "BST164"],
["0607", "Siropova", "Violetta", "Eduardovna", "28.05.2010", "BST162"],
["4004
def split_func(a):
    """Parse a JSON array and build a dict from textual pieces of each element.

    Each element is converted with ``str()``, split on ',' and ':', and the
    2nd piece is mapped to the 4th piece exactly as split.
    NOTE(review): the pieces keep the quotes, spaces and braces of the
    ``str()`` representation; reading the fields from the parsed objects is
    probably what is wanted - confirm with callers.
    """
    import json
    import re

    records = json.loads(a.strip())
    purchases = {}
    for record in records:
        pieces = re.split(r",|:", str(record))
        purchases[pieces[1]] = pieces[3]
    return purchases
split_func(a)
def multiple_of_index(arr):
    """Return the elements of ``arr`` that are multiples of their own index.

    Index 0 is always skipped. The index check now comes first: evaluating
    ``num % i`` before ``i != 0`` raised ZeroDivisionError on the first element.
    """
    return [num for i, num in enumerate(arr) if i != 0 and num % i == 0]
def value_counts(df, year):
    """Return the per-year row counts of ``df`` if it contains any 5.0 rating.

    Returns a message string when no row has rating 5.0.

    The truth value of a filtered DataFrame is ambiguous and raised
    ValueError; the presence of 5.0 ratings is now tested explicitly.
    NOTE(review): ``year`` is not used and the counts cover all rows, not only
    the 5.0-rated ones - confirm the intended semantics.
    """
    has_five_star = (df['rating'] == 5.0).any()
    if has_five_star:
        return df["year"].value_counts()
    return "no movie with 5.0 rating in this year"
import pandas as pd
news = pd.read_csv('https://raw.githubusercontent.com/ml-mipt/ml-mipt/basic/homeworks/Lab1_python_basics/news_sample.csv')
news[news['news_title'].str.contains('[0-9]{8}-[a-z]+')]
# First solution
def transpose(matrix):
    """Return the transpose of a rectangular list-of-lists ``matrix``.

    The width is taken from the first row, so an empty ``matrix`` raises IndexError.
    """
    width = len(matrix[0])
    return [[row[col] for row in matrix] for col in range(width)]
def NAWE(country):
    """Return True when ``country`` is outside the North America / Western Europe lists."""
    north_america = ['Canada', 'United States']
    western_europe = ['United Kingdom', 'Germany', 'Netherlands']
    return country not in north_america and country not in western_europe
df = df[df['birth_country'].apply(NAWE)]
import pandas as pd
import numpy as np
def func(a, b, c):
a = pd.DataFrame(a)
b = pd.DataFrame(b)
c = pd.DataFrame(c)
a['treatment'] = 'A'
b['treatment'] = 'B'
c['treatment'] = 'C'
a['index'] = np.arange(len(a))
b['index'] = np.arange(len(b))
c['index'] = np.arange(len(c))
a = a.rename(columns={0: "value"})
b = b.rename(columns={0: "value"})
c = c.rename(columns={0: "value"})
a = a[['index', 'treatment', 'value']]
b = b[['index', 'treatment', 'value']]
c = c[['index', 'treatment', 'value']]
d =
#plot the grid
plt.grid(axis='y', alpha=0.75)
def guessBlue(blueStart, redStart, bluePulled, redPulled):
    """Return the probability of pulling a blue marble next."""
    blue_left = blueStart - bluePulled
    all_left = blue_left + redStart - redPulled
    return blue_left / all_left
guessBlue(5, 5, 2, 3)
def load_json(line):
    """Parse ``line`` as JSON; return an empty dict when it is not valid JSON."""
    import json
    from json.decoder import JSONDecodeError

    try:
        return json.loads(line)
    except JSONDecodeError:
        return {}
purchases = {}
# The context manager closes the file even if a malformed line raises;
# the handle was previously never closed.
with open('purchase_log.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split(',')
        # The 2nd comma-separated field is the key, the 4th field its value.
        purchases[fields[1]] = fields[3]
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(1)
A = np.random.normal(0, 1, 50)
B = np.random.normal(1, 1, 50)
def rate_group(x):
    """Classify a percentage string such as ``"12.50%"`` into a rate bucket.

    Returns '>15' above 15, '10-15' for 10 to 15 inclusive and '<10' below 10.

    Comparing the raw strings ordered them lexicographically ("9.50%" sorted
    above "15.00%"), and rates below 10 returned ``None``; the value is now
    parsed as a number and the third bucket matches the other ``rate_group``
    definitions in this file.
    """
    rate = float(str(x).strip().rstrip('%'))
    if rate > 15.0:
        return '>15'
    if rate >= 10.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(lambda x: rate_group(x))
def find_difference(a, b):
    """Return the absolute difference between the products of ``a`` and ``b``.

    Both sequences must be non-empty (``reduce`` raises TypeError otherwise).
    """
    # Imported locally: `reduce` is not a builtin in Python 3 and was never imported.
    from functools import reduce
    from operator import mul

    return abs(reduce(mul, a) - reduce(mul, b))
speed = np.array([0.0, 83.6, 98.3, 108.5, 124.9, 141.7, 161.3, 187.3, 197.8, 217.1, 232.6, 250.1])
fvalue, pvalue = stats.f_oneway(speed[:6], speed[6:])
# Solution
fvalue, pvalue = stats.f_oneway(speed[:6], speed[6:])
print(pvalue)
def year_leaders(dfp):
    """Return one row per ``chart_debut`` year: the entry with the most hits.

    The result is indexed by ``chart_debut`` in descending order; ``dfp`` is not modified.
    """
    by_year_then_hits = dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    # The first row seen for each year is that year's leader.
    is_leader = ~by_year_then_hits.duplicated(subset='chart_debut', keep='first')
    return by_year_then_hits[is_leader].set_index('chart_debut')
df_new.groupby(['route', 'incident_type']).size().sort_values(ascending=False).head()
def group_by_performer(df):
    """Return one row per performer with a 'hits' column added.

    Works on a copy sorted by performer. 'hits' holds, for the row's song, the
    comma-joined unique performers of that song. Only the first row of each
    performer is kept, and the old index is moved into an 'index' column.
    """
    ordered = df.copy()
    ordered = ordered.sort_values(by='performer')
    performers_per_song = ordered.groupby('song')['performer']
    ordered['hits'] = performers_per_song.transform(lambda names: ','.join(names.unique()))
    one_per_performer = ordered.drop_duplicates(subset='performer')
    return one_per_performer.reset_index()
group_by_performer(df)
def total_ingredients(cook_book, dish='salad', portions=5):
    """Print and return the quantities of ``dish`` multiplied by ``portions``.

    Args:
        cook_book: Mapping of dish name to a list of ingredient dicts, each
            with a 'quantity' entry.
        dish: Name of the dish (default 'salad', the former hard-coded value).
        portions: Multiplier (default 5, the former hard-coded value).

    Returns:
        The scaled quantities in recipe order; an empty list if the dish is unknown.

    Replaces a body that raised KeyError on ``cook_book['quantity']``, used an
    undefined ``key`` and repeated the string 'quantity' instead of scaling numbers.
    """
    scaled = [item['quantity'] * portions for item in cook_book.get(dish, [])]
    for grams in scaled:
        print(grams)
    return scaled
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
{'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
total_ingredients(cook_book)
import numpy as np
def sum_matrix(N):
    """Return the sum of all entries of the diagonal matrix diag(N, N-1, ..., 1).

    Every off-diagonal entry is zero, so the total equals the sum of the
    diagonal values themselves.
    """
    diagonal_values = np.arange(N, 0, -1)
    return diagonal_values.sum()
sum_matrix(5)
sum_matrix(10)
sum_matrix(15)
def anova_1way(a, b):
fvalue, pvalue = stats.f_oneway(a, b)
if pvalue < 0.05:
print("Reject Null Hypothesis - Significant differences exist between group means")
else:
print("Fail to Reject Null Hypothesis - No significant difference between group means")
anova_1way(p.Speed, boosted_p.Speed)
def create_scaled_data(sl, sw):
    """Return a DataFrame with ``sl`` and ``sw`` min-max scaled to [0, 1].

    Accepts any 1-D sequence. Plain Python lists, as defined in this file,
    used to fail with ``list - float``.
    NOTE: a constant column has zero range and yields NaN values.
    """
    def _min_max(values):
        # Convert once so arithmetic is element-wise for lists as well.
        series = pd.Series(values, dtype=float)
        return (series - series.min()) / (series.max() - series.min())

    return pd.DataFrame({'sl': _min_max(sl), 'sw': _min_max(sw)})
scaled_data = create_scaled_data(sl, sw)
question6 = 'What is the most preferred working environment for you.'
# Count how often each answer to the question occurs.
question6 = df[question6].value_counts()
label = question6.index
counts = question6.values
# For a horizontal bar chart the numeric values go on x and the categories on
# y (the same orientation the bar_plot helper in this file uses).
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
# Your code here
df.sort_values('name', ascending=False)
def sort_df(df):
return df.groupby('decade').rating.value_counts().sort_index()
def add(arr):
    """Return the sum of all numbers in ``arr`` (0 for an empty sequence)."""
    return sum(arr)
add([2, 1, 10, 5])
def vowel_2_index(inp):
    """Replace the character codes of the vowels a, e, i, o, u in ``inp`` by the letters.

    Only elements whose type is exactly ``int`` are examined; everything else
    is left as it is. ``inp`` is modified in place and returned.

    The code for "u" was given as 118, which is ``ord('v')``; it is now 117.
    """
    vowel_by_code = {97: "a", 101: "e", 105: "i", 111: "o", 117: "u"}
    for position, item in enumerate(inp):
        # Exact type check keeps bools (a subclass of int) untouched.
        if type(item) is int and item in vowel_by_code:
            inp[position] = vowel_by_code[item]
    return inp
inp = [118, "u",120,121,"u",98,122,"a",120,106,104,116,113,114,113,120,106 ]
vowel_2_index(inp)
#df_new.route.value_counts()
def df_count(dataframe, column):
return dataframe[column].value_counts()
df_count(df_new, 'route')
def get_ingredients(dish, portions):
recipes = cook_book.get(dish, [])
if not recipes:
print('No such dish')
return
print(f'You need to cook {dish} for {portions} portions')
for recipe in recipes:
print(f'{recipe["ingridient_name"]}: {recipe["quantity"] * portions} {recipe["measure"]}')
def remove_percent(row):
    """Convert a string such as ``"12.5%"`` or ``"12.5"`` to a float."""
    text = row.replace("%", "") if row.endswith("%") else row
    return float(text)
df.rate_group = df.rate_group.map(remove_percent)
import pandas
df.sort_values(by=['time_on_chart', 'max'], ascending=False).head(20)
import pandas as pd
df = pd.DataFrame(data, columns=['performer', 'hits', 'chart_debut'])
def divide_decade(df, decade):
    """Return the rows whose ``chart_debut`` lies inside ``decade``, both ends included.

    ``decade`` is a string of the form ``'YYYY-YYYY'``.
    """
    first_year = int(decade[0:4])
    last_year = int(decade[5:9])
    debut = df['chart_debut']
    in_decade = (debut >= first_year) & (debut <= last_year)
    return df[in_decade]
divide_decade(df, '1980-1990')['performer']
def goes_after(word, first, second):
    """Return True if ``second`` directly follows ``first`` anywhere in ``word``.

    The previous loop returned from its very first iteration, so only the
    first character was ever examined, and it read ``word[i + 1]`` past the
    end of the string. All adjacent pairs are now checked.
    """
    for position in range(len(word) - 1):
        if word[position] == first and word[position + 1] == second:
            return True
    return False
import matplotlib.pyplot as plt
a = [1,3,3,4,4,4]
b = [2,3,4,4,4,4,4]
plt.hist(a, alpha=0.5, color='b')
plt.hist(b, alpha=0.5, color='r')
plt.show()
import numpy as np
def create_array(N):
return np.arange(N-1, -1, -1)
create_array(10)
def get_movie_rating():
df = pd.read_csv(MOVIE_DATA_FILE_PATH)
df2 = df.groupby(['title'])['rating'].mean().round(1)
return df2
get_movie_rating()
def search_and_replace(lst, new_id, replacement='9090'):
    """Overwrite the id of every record whose id equals ``new_id``, then print ``lst``.

    Args:
        lst: List of mutable records whose element 0 is the id; modified in place.
        new_id: Id to look for.
        replacement: Value written instead; defaults to the previously
            hard-coded '9090'.
    """
    for record in lst:
        if record[0] == new_id:
            record[0] = replacement
    print(lst)
search_and_replace(lst, '4004')
def train_test_split(X, y, test_size=0.3, random_state=42):
    """Randomly split ``X`` and ``y`` into aligned train and test parts.

    Input:
        X: A pandas dataframe.
        y: A pandas series. Corresponds to the labels.
        test_size: Test size as a fraction of the total dataset.
        random_state: Seed of the shuffle, so the split is reproducible.

    Returns:
        ``X_train, X_test, y_train, y_test``; the test part holds
        ``ceil(test_size * len(X))`` rows.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length.
    """
    import math

    import numpy as np

    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")
    n_samples = len(X)
    n_test = math.ceil(n_samples * test_size)
    # One shared permutation keeps features and labels aligned.
    order = np.random.RandomState(random_state).permutation(n_samples)
    test_positions, train_positions = order[:n_test], order[n_test:]
    X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
    y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]
    return X_train, X_test, y_train, y_test
from math import sqrt
def distance(a, b):
    """Return the Euclidean distance between two objects exposing ``x`` and ``y``."""
    dx = a.x - b.x
    dy = a.y - b.y
    return sqrt(dx ** 2 + dy ** 2)
from gensim import similarities
cos_sim = similarities.MatrixSimilarity(tfidf[bows])
import math
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``, rounded to 6 decimals."""
    delta_x = b.x - a.x
    delta_y = b.y - a.y
    length = math.sqrt(delta_x**2 + delta_y**2)
    return round(length, 6)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
def upgrade(num, money=0, successes=0):
    """Score one drawn number and return the updated ``(money, successes)``.

    Args:
        num: The drawn number.
        money: Balance before the draw (default 0).
        successes: Winning draws so far (default 0).

    Exact jackpots (777, 999, 555, 333, 111) pay 200/100/50/15/10; otherwise
    the helper predicates ``ends77``, ``ends7``, ``ends00`` and ``ends0``
    decide on 5/3/2/1. A losing draw costs 1 and does not count as a success.

    ``money`` and ``successes`` used to be modified without ever being
    defined inside the function (UnboundLocalError on every call); they are
    now optional parameters.
    """
    jackpots = {777: 200, 999: 100, 555: 50, 333: 15, 111: 10}
    if num in jackpots:
        prize = jackpots[num]
    elif ends77(num):
        prize = 5
    elif ends7(num):
        prize = 3
    elif ends00(num):
        prize = 2
    elif ends0(num):
        prize = 1
    else:
        return money - 1, successes
    return money + prize, successes + 1
import pandas as pd
df = pd.read_csv('https://stepik.org/media/attachments/course/4852/accountancy.csv')
def show_all_yandex(df):
return df.loc[df['Executor'] == 'Yandex']
def func(delta, sigsqr, conf):
return round(sigsqr * conf ** 2 / delta ** 2)
def distinct(seq):
    """Return the elements of ``seq`` without duplicates, in order of first appearance.

    ``list(set(seq))`` returned the elements in arbitrary order;
    ``dict.fromkeys`` keeps insertion order. Elements must be hashable, as before.
    """
    return list(dict.fromkeys(seq))
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(endog = df['A'], # Data
groups = df['B'], # Groups
alpha=0.05) # Significance level
tukey.summary()
fig = tukey.plot_simultaneous(comparison_name="C")
fig.show()
def make_dict(my_list):
    """Nest the items of ``my_list``: ``['a', 'b', 'c']`` -> ``{'a': {'b': 'c'}}``.

    The last item is the innermost value and every earlier item becomes a key
    one level further out. Fewer than two items raise IndexError.
    """
    # Build from the inside out; indexing [-2] raises IndexError for short input.
    nested = {my_list[-2]: my_list[-1]}
    for key in reversed(my_list[:-2]):
        nested = {key: nested}
    return nested
my_list = ['a', 'b', 'c']
print(make_dict(my_list))
def distance(a, b):
    """Return the straight-line distance between points ``a`` and ``b`` (``x``/``y`` attributes)."""
    x_gap = a.x - b.x
    y_gap = a.y - b.y
    squared = x_gap ** 2 + y_gap ** 2
    return math.sqrt(squared)
def decade_of_the_movie(movie_rating):
    """Print a message depending on ``movie_rating``.

    Ratings above 5 and ratings of 4 or less print the "Most movies" message;
    anything in between prints "No movies were found".
    (The ``def`` line was missing its colon, a syntax error.)
    """
    if movie_rating > 5:
        print("Most movies were in the decade")
    elif movie_rating <= 4:
        print("Most movies were in the decade")
    else:
        print("No movies were found")
decade_of_the_movie(5)
import pandas as pd
from pandas import DataFrame
sl = [[-0.90068117],
[-1.14301691],
[-1.38535265],
[-1.50652052],
[-1.02184904],
[-0.53717756],
[-1.50652052],
[-1.02184904],
[-1.74885626],
[-1.14301691]]
sw = [[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.]]
data = {'SL': sl, 'SW': sw}
print(pd.DataFrame(data))
def del_col(data_frame):
data_frame.drop(['spi_rank', 'country'], axis=1, inplace=True)
return data_frame
def duplicates(data):
#your code here
result = duplicates(data)
def group_list(grnum):
    """Return the sorted full names of the students in group ``grnum``.

    Reads the module-level mapping ``dct``. A student is selected when
    ``grnum`` is one of the fields of the record, matching the two-argument
    ``group_list`` in this file; the membership test used to be run against
    the dictionary key instead. The name is the first three fields joined by spaces.
    """
    members = [' '.join(dct[key][0:3]) for key in dct if grnum in dct[key]]
    members.sort()
    return members
group_list('BST161')
# Both conditions must be combined inside one callable: `&` is not defined
# between two lambda objects and raised TypeError.
df.loc[lambda x: (x["revenue"] > 40000) & (x["total_rooms"] < 30)]
def check_id(lst):
    """Print every element that occurs more than once in ``lst``.

    Each repeated element is printed once, in order of first appearance. If
    nothing repeats, "Non-repeating elements" is printed instead.

    The previous version removed all duplicates with ``set`` before looking
    for them and used the element itself as a slice index.
    """
    from collections import Counter

    repeated = [item for item, count in Counter(lst).items() if count > 1]
    for item in repeated:
        print(item)
    if not repeated:
        print("Non-repeating elements")
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False, figsize=(10, 10))
plt.show()
def find_questions(df):
    """Return a frame shaped like ``df`` that keeps only the '?' cells (others become NaN)."""
    is_question = df == '?'
    return df.where(is_question)
find_questions(data)
data = [['2010-05-21',100000], ['2011-01-30',50000], ['2011-03-16',100000], ['2011-04-11',5000000], ['2011-04-16',50000], ['2011-04-18',50000], ['2011-05-12',100], ['2011-06-23',1000], ['2011-06-26',5000], ['2011-06-29',1000000], ['2011-07-10',5000000], ['2011-09-20',50000], ['2011-09-22',1000000]]
df = pd.DataFrame(data, columns = ['Last Updated', 'Installs'])
def date_to_year(a):
return pd.to_datetime(a).year
df['year'] = df.apply(lambda x: date_to_year(x['Last Updated']), axis=1)
# import pandas
import pandas as pd
# read csv file
df_new = pd.read_csv('file.csv')
# calculate mean of the "date" column for each "operator"
df_new.groupby('operator')['date'].mean()
gdpdiff[['Country or region', 'GDP per capita']].plot(kind='bar', x='Country or region', y='GDP per capita', color='purple')
def grouped_operators(df):
return df.groupby(["operator"])["date"].sum()
import pandas as pd
df = pd.read_csv('studentscores.csv')
def clean_data(df):
for index, row in df.iterrows():
for i, item in enumerate(row):
if type(item) == str:
if '%' in item:
df.iloc[index, i] = item.replace('%', '').replace(',', '')
else:
df.iloc[index, i] = float('NaN')
return df
clean_data(df)
def how_much_water(water, load, clothes):
return water * 1.1 ** (clothes - load)
def sum_all(arr):
    """Return the sum of ``i + j`` over all ordered pairs of elements with ``i < j``."""
    total = 0
    for smaller in arr:
        for larger in arr:
            if smaller < larger:
                total += smaller + larger
    return total
sum_all(arr)
def GraphMaker(vec1, vec2, angle):
#Implement me
return ax
GraphMaker(film_2, film_4, 65.9)
import matplotlib.pyplot as plt
import numpy as np
# we need 2 vectors
film_2 = np.array([0,1,0,1,0,0])
film_4 = np.array([1,0,0,1,0,1])
# We need to know how to plot vectors
# We need to know how to get the angle between them
# We need to know how to make a graph
def find_multiples(integer, limit):
    """List ``integer * i`` for ``i`` in 1..``limit`` that do not exceed ``limit``."""
    multiples = []
    for factor in range(1, limit + 1):
        candidate = integer * factor
        if candidate <= limit:
            multiples.append(candidate)
    return multiples
def words():
    """Read words from stdin until "end" is entered, then print them space-joined."""
    collected = []
    word = input("Enter word: ")
    while word != "end":
        collected.append(word)
        word = input("Enter word: ")
    print("Ending")
    print(" ".join(collected))
words()
df.loc[lambda x: (x['revenue'] > 40000) & (x['total_rooms'] < 30)]
import pandas as pd
import re
s = pd.Series(['10','78','54','GOOD','64','23'])
def find_non_numbers(s):
    """Return the entries of string Series ``s`` that are not made only of digits.

    The previous replace-then-contains chain depended on the pandas default
    for ``regex`` in ``str.replace`` (which changed between versions) and
    used a non-raw pattern. A single full-match test is version-independent.
    Missing values count as non-numbers.
    """
    is_number = s.str.fullmatch(r'\d+', na=False)
    return s[~is_number]
find_non_numbers(s)
def sale_hotdogs(n):
    """Return the total price of ``n`` hotdogs: 100 each below 5, 95 below 10, else 90."""
    if n < 5:
        unit_price = 100
    elif n < 10:
        unit_price = 95
    else:
        unit_price = 90
    return n * unit_price
df = pd.read_csv('customers.csv')
df.sort_values(by='name', inplace=True)
#df[(df['name'].duplicated(keep=False))].sort_values(by='name', inplace=True)
df[(df['name'].duplicated(keep=False))].sort_values(by='name').head(10)
df=pd.read_clipboard()
df2 = df[df=='?']
print("the original dataframe:")
print(df)
print("the new dataframe:")
print(df2)
def mean_str_len(data, column1, column2):
    # Unimplemented exercise stub: the body is a bare Ellipsis, so the
    # function currently returns None whatever it is given.
    # data - a dataframe
    # column1 - a string, the name of a column
    # column2 - a string, the name of another column
    # YOUR CODE HERE
    ...
def get_needed_posts(query):
site = pd.DataFrame()
for q in query:
URL = parseurl+'search/'
params = {
'q': q
}
req = requests.get(URL, params=params)
time.sleep(0.3)
soup = BeautifulSoup(req.text)
articles = soup.find_all('article', class_='tm-articles-list__item')
for article in articles:
try:
title = article.find('h2', class_='tm-article').text
date = article.find('span', class_='tm-article').text.strip()
link = article.find('h2', class_='tm-article').find('a').get('href')
if title not in site.title.values and link not in site.link.values:
row = {'date': date, 'title': title, 'link': 'https://habr.com'+link}
site = pd.concat([site
def confidence_interval(n, mean, sig, conf):
    """Return the width between the rounded ends of a normal confidence interval.

    ``sig`` is scaled by ``sqrt(n)`` and the two-sided normal quantile for
    ``conf`` gives the margin; both ends are rounded before subtracting.
    """
    std_error = sig / (n ** (1/2))
    quantile = norm.ppf((1+conf)/2)
    lower = round(mean - std_error * quantile)
    upper = round(mean + std_error * quantile)
    return upper - lower
def interval(n, mean, sig, conf):
    """Return the rounded Student-t confidence interval for a sample mean.

    Args:
        n: sample size (``n - 1`` degrees of freedom are used).
        mean: sample mean.
        sig: sample standard deviation.
        conf: two-sided confidence level, e.g. 0.95.

    Returns:
        Tuple ``(lower, upper, centre)``, each rounded to an integer.

    The margin is ``t * sig / sqrt(n)``. The earlier version omitted the
    ``sqrt(n)`` division, so ``n`` only affected the degrees of freedom,
    unlike the other ``interval`` helpers in this file.
    """
    std_error = sig / n ** 0.5
    h = std_error * t.ppf((1 + conf) / 2, n - 1)
    return round(mean - h), round(mean + h), round(mean)
interval(n=20, mean=3.3, sig=2.4, conf=0.95)
import pandas as pd
df = pd.DataFrame({'time_on_chart': [1,1,1,1],
'max': [11,10,1,20]}, index=['"Groove" Holmes', '"Little" Jimmy Dickens', '"Pookie" Hudson', '"Weird Al" Yankovic'])
df.sort_values(['time_on_chart', 'max'], ascending=False)
import numpy as np
def dia(N):
    """Return the trace of the matrix holding ``arange(N)`` on its first sub-diagonal.

    The main diagonal of that matrix is all zeros, so the trace is 0.
    """
    shifted = np.diag(np.arange(N), k=-1)
    main_diagonal_sum = np.trace(shifted)
    return main_diagonal_sum
print(dia(4))
print(dia(12))
print(dia(15))
def fill_na_by_corr(df, col_name='rectal_temp'):
    # Fill NaNs in the columns of `df` that contain missing values, using
    # each column's mean scaled by its correlation with `col_name`.
    # Mutates `df` column by column and returns it.
    # Work only on columns that contain at least one NaN; 'outcome' is dropped
    # (raises KeyError if 'outcome' is not among those columns).
    df_temp = df[df.columns[df.isna().any()].tolist()].drop(['outcome'], axis = 1)
    # Keep rows where the reference column is present.
    df_temp = df_temp.dropna(subset = [col_name])
    # Correlation of every remaining column with `col_name` (itself excluded).
    cor = df_temp.corr()[[col_name]].drop([col_name])
    for i in cor.index:
        if cor.loc[i, col_name] > 0:
            # Positive correlation: fill with mean * correlation.
            df[i] = df[i].fillna(value = df[i].mean() * cor.loc[i, col_name])
        else:
            # NOTE(review): for zero/negative correlation the mean is divided
            # by the coefficient, which flips the sign (or divides by zero);
            # confirm this is intended.
            df[i] = df[i].fillna(value = df[i].mean() / cor.loc[i, col_name])
    return df
def add(a, b):
    """Return the sum of ``a`` and ``b``."""
    total = a + b
    return total
add(1, 2)
from scipy.stats import ttest_ind
import numpy as np
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])
ttest_ind(a, b)
def gen_id(row):
    """Build an identifier of the form "<id>-<hour>" from the row's attributes."""
    return f"{row.id}-{row.hour}"
df['id'] = df.apply(gen_id, axis=1)
def rounder(x):
    """Round ``x`` to an integer, clamping values below 1 to 1 and above 4 to 4."""
    if x < 1:
        return 1
    return 4 if x > 4 else round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].apply(rounder)
def is77(x):
    """Return True when ``str(x)`` ends in '77' but is not exactly '777'."""
    text = str(x)
    return text.endswith('77') and text != '777'
is77(77)
def replace_non_numbers(df, col):
    """Replace, in place, every entry of ``df[col]`` containing a non-digit with NaN.

    ``Series.replace`` with a Series as ``to_replace`` and a scalar value is
    rejected by pandas, so the column is masked with the boolean test
    directly. Missing values are left as they are.
    """
    has_non_digit = df[col].str.contains(r'[^0-9]', na=False)
    df[col] = df[col].mask(has_non_digit)
def collatz(number):
    """Print and return the next Collatz value after ``number``.

    Even numbers are halved; odd numbers become ``3 * number + 1``. Values
    whose remainder modulo 2 is neither 0 nor 1 yield None with no output.
    """
    remainder = number % 2
    if remainder == 0:
        following = number // 2
    elif remainder == 1:
        following = 3 * number + 1
    else:
        return None
    print(following)
    return following
# Convert once up front: comparing the raw input string with the integer 1
# was always unequal, so entering "1" still ran a Collatz step.
n = int(input("Enter number: "))
# `> 1` also stops on zero or negative input, which would otherwise loop
# forever because such values never reach 1.
while n > 1:
    n = collatz(n)
import re
def convert_size(size):
    """Convert a size string such as '19M', '1.5k' or '100' to an integer.

    A trailing 'M' multiplies by 1,000,000 and a trailing 'k' by 1,000;
    anything else is passed to ``int`` unchanged.
    """
    multipliers = {'M': 1000000, 'k': 1000}
    suffix = size[-1]
    if suffix in multipliers:
        return int(float(size[:-1]) * multipliers[suffix])
    return int(size)
replaces = lambda x: convert_size(x)
data['Size'] = data['Size'].apply(convert_size)
df[df['text'].str.contains('\d{8}-')]
def my_fun(df, var0, var1, var2):
    """Return the maximum of ``var2`` for each (``var0``, ``var1``) pair as a flat frame."""
    grouped = df.groupby([var0, var1])
    maxima = grouped[var2].max()
    return maxima.reset_index()
my_fun(df, 'traffic_source', 'region', 'source_type')
df = pd.DataFrame(data = [[1, 31, 2.5, 1260759144], [1, 1029, 3.0, 1260759179], [1, 1061, 3.0, 1260759182], [1, 1129, 2.0, 1260759185], [1, 1172, 4.0, 1260759205]],
columns = ['userId', 'movieId', 'rating', 'timestamp'])
def average_lifetime(df):
    """Return per-user column maxima and minima plus the timestamp span.

    df: input dataframe with at least ``userId`` and ``timestamp`` columns.
    The result is indexed by ``userId``; its ``average_lifetime`` column is
    ``timestamp_max - timestamp_min``.
    """
    per_user = df.groupby(['userId'])
    extremes = pd.merge(per_user.max(), per_user.min(),
                        on=['userId'], suffixes=('_max', '_min'))
    extremes['average_lifetime'] = extremes['timestamp_max'] - extremes['timestamp_min']
    return extremes
df = average
def round(a):
    """Round ``a`` to the nearest integer, with halves going up.

    ``int(a + 0.5)`` truncates toward zero and is therefore wrong for
    negative inputs (-1.6 gave -1); ``floor`` gives the intended result.

    Note: this name shadows the built-in ``round`` for the rest of the module.
    """
    import math
    return math.floor(a + 0.5)
round(2.5)
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Initialize the figure
f, ax = plt.subplots(figsize=(6.5, 6.5))
# Load the example car crash dataset
crashes = sns.load_dataset("car_crashes").sort_values("total", ascending=False)
# Plot the total crashes
sns.set_color_codes("pastel")
sns.barplot(x="total", y="abbrev", data=crashes,
label="Total", color="b")
# Plot the crashes where alcohol was involved
sns.set_color_codes("muted")
sns.barplot(x="alcohol", y="abbrev", data=crashes,
label="Alcohol-involved", color="b")
# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(x
df['country'] = df['country'].astype(str)
def get_dataframe():
    """Build a two-column frame ('sl', 'sw') holding the values 1-5 and 6-10."""
    import pandas as pd
    rows = {
        'sl': [1, 2, 3, 4, 5],
        'sw': [6, 7, 8, 9, 10],
    }
    by_row = pd.DataFrame(data=list(rows.values()), index=list(rows))
    return by_row.transpose()
get_dataframe()
def find_difference(a, b):
    """Return the product of the largest element of ``a`` and the largest of ``b``."""
    largest_a, largest_b = max(a), max(b)
    return largest_a * largest_b
find_difference([1, 2, 3], [4, 5, 6])
plt.ylim(3,5)
def year_leaders(dfp):
    """Return, per ``chart_debut``, the row with the most ``num_of_hits``.

    Rows are ordered by debut and hit count, both descending; the result is
    indexed by ``chart_debut`` and the input frame is not modified.
    """
    ranked = dfp.copy().sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    leaders = ranked.drop_duplicates(subset='chart_debut', keep='first')
    return leaders.set_index('chart_debut')
year_leaders(dfp)
# df2 Pandas dataframe is given
def fillna_median(df2):
    """Fill missing ``rectal_temp`` values with the column median, in place; return ``df2``."""
    temperatures = df2['rectal_temp']
    df2['rectal_temp'] = temperatures.fillna(temperatures.median())
    return df2
fillna_median(df2)
def colmax(df):
    """Return a one-row frame holding the maximum of every column of ``df``."""
    maxima = [df[name].max() for name in df.columns]
    return pd.DataFrame(maxima, index=df.columns).T
from datetime import datetime as dt
from datetime import timedelta as td
def date_range(start_date, end_date):
    """Return every day from ``start_date`` to ``end_date`` inclusive as 'YYYY-MM-DD'.

    Args:
        start_date, end_date: either 'YYYY-MM-DD' strings or datetime
            objects. The example call passes strings, which the earlier
            version could not handle because it called ``strftime`` and
            added a timedelta directly on them.

    Returns:
        List of date strings; empty when ``start_date`` is after ``end_date``.

    Raises:
        ValueError: if a string argument is not in 'YYYY-MM-DD' format.
    """
    def _as_datetime(value):
        # Parse strings; pass datetime-like objects through untouched.
        return dt.strptime(value, '%Y-%m-%d') if isinstance(value, str) else value

    current = _as_datetime(start_date)
    last = _as_datetime(end_date)
    result = []
    while current <= last:
        result.append(current.strftime('%Y-%m-%d'))
        current += td(days=1)
    return result
date_range('1992-09-01', '1992-09-04') # ['1992-09-01', '1992-09-02', '1992-09-03', '1992-09-04']
from math import factorial
def wilson_primes(p):
    """Return True when ``(p - 1)! + 1`` is divisible by ``p * p`` and ``p > 1``.

    The ``p > 1`` guard is evaluated first so that 0 and negative inputs
    return False instead of raising from ``factorial`` of a negative number
    or from a modulo by zero.
    """
    return p > 1 and (factorial(p - 1) + 1) % (p * p) == 0
wilcox.test(d$Sepal.Length, d$Petal.Length, paired = FALSE)
df.groupby('song').first().reset_index()
import pandas as pd
import re
df = pd.DataFrame({'loan_amnt': [5000, 2500, 2400, 10000, 3000, 5000, 7000], 'int_rate': ['10.65%', '15.27%', '15.96%', '13.49%', '12.69%', '7.90%', '15.96%']})
df['rate_group'] = None
def rate_group(a):
    """Bucket a numeric rate: '>15' above 15, '10-15' from 10 to 15 inclusive, else '<10'."""
    if a > 15.00:
        return '>15'
    if 10.00 <= a <= 15.00:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(lambda x: (re.sub('%', '', x)))
df['rate_group'] = df['rate_group'].apply(lambda x: float(x))
df['rate_group'] = df['rate_group'].apply(rate_group)
display(df)
my_string = "How much is this going to cost?"
# returns false
"%" in my_string
# returns true
"%" in "Price: $100%"
def howManyLightSabersDoYouOwn(name="anyone else"):
    """Return 18 for "Zach" and 0 for every other name."""
    return 18 if name == "Zach" else 0
def average_temperature(df):
    """Average ``av_temp`` per country and decade, and list the coldest entries.

    Returns:
        ``(avg_temp_by_country, coldest_countries_list)``: the grouped means
        as a flat frame, and the ``country`` values of its 20 lowest rows
        (a country can appear once per decade).

    The grouping key was misspelled 'countrry', which raised KeyError; it
    now matches the 'country' column read two lines later.
    """
    avg_temp_by_country = df.groupby(['country', 'decade'])['av_temp'].mean().reset_index()
    coldest_countries = avg_temp_by_country.sort_values('av_temp', ascending=True).head(20)
    coldest_countries_list = list(coldest_countries['country'])
    return avg_temp_by_country, coldest_countries_list
import uuid
df_new['id'] = df_new.apply(lambda row: uuid.uuid4(), axis=1)
def max_key(dct, key):
    """Placeholder for an unwritten helper.

    The definition consisted of a comment only, which is a syntax error
    (a ``def`` needs at least one statement). Until the behaviour is
    specified, fail loudly rather than silently returning None.
    """
    # TODO: implement
    raise NotImplementedError("max_key is not implemented yet")
from math import sqrt
def interval(n, mean, sig, conf):
    """Return the rounded half-width of a normal confidence interval for the mean.

    Args:
        n: sample size.
        mean: sample mean (unused; kept for signature compatibility).
        sig: standard deviation.
        conf: two-sided confidence level, e.g. 0.95.

    The quantile was hard-coded to 1.96, so ``conf`` was ignored; it is now
    derived from ``conf`` (about 1.96 for 0.95).
    """
    from statistics import NormalDist
    z = NormalDist().inv_cdf((1 + conf) / 2)
    h = z * sig / sqrt(n)
    return round(h)
interval(100, 6, 2, 0.95)
import pandas as pd
df = pd.read_csv('ratings.csv')
grouped = df.groupby('userId')
top_users = grouped.filter(lambda x: len(x) >= 100)
def multiply_cook_book(portions):
    """Return a copy of the global ``cook_book`` with quantities multiplied by ``portions``."""
    return {
        dish_name: [
            {'ingridient_name': item['ingridient_name'],
             'quantity': item['quantity'] * portions,
             'measure': item['measure']}
            for item in ingredients
        ]
        for dish_name, ingredients in cook_book.items()
    }
print(multiply_cook_book(2))
def get_year(row):
    """Return the four-digit year in parentheses from ``row['title']``.

    Intended for ``df.apply(get_year, axis=1)``. The last ``(YYYY)`` group
    in the title is returned as a string, or None when there is none.

    The earlier version called ``.extract()`` on a plain string and then
    returned the literal text "year".
    """
    import re
    years = re.findall(r'\((\d{4})\)', str(row['title']))
    return years[-1] if years else None
df.apply(get_year)
import pandas as pd
def find_us(val):
    """Collapse acquisition-channel labels into 'aggregators' / 'social'; else return ``val``."""
    social_prefixes = ('vk_adv', 'facebook_adv', 'instagram_adv', 'telegram_adv')
    if val.startswith('agg'):
        return 'aggregators'
    if val.startswith(social_prefixes):
        return 'social'
    return val
df = pd.read_csv('/datasets/how_find_us.csv')
df['how_find_us'] = df['how_find_us'].apply(find_us)
df['how_find_us'].value_counts()
def year_leaders(df):
    """Print each debut year's top hit count and return the rows that reach it."""
    top_per_year = df.groupby(by=['chart_debut']).agg({'num_of_hits': 'max'}).reset_index()
    print(top_per_year)
    join_keys = ['chart_debut', 'num_of_hits']
    return df.merge(top_per_year, on=join_keys, how='inner')
year_leaders(df)
def group_by_performer(df):
    # Unimplemented stub: `df` is ignored and None is returned.
    pass
data = loadmovies()
pdata = pd.DataFrame(data)
grouped = pdata.groupby('userId')['timestamp'].agg(['max', 'min'])
grouped['diff'] = grouped['max'] - grouped['min']
grouped[pdata.groupby('userId')['rating'].count() > 100].mean()
def add_x_axis_labels(fig, x, labels):
    """Put explicit ticks on the x axis of ``fig`` (positions ``x``, text ``labels``); return ``fig``."""
    axis_settings = {
        'tickmode': 'array',
        'tickvals': x,
        'ticktext': labels,
    }
    fig.update_layout(xaxis=axis_settings)
    return fig
add_x_axis_labels(fig, counts, label)
def show_bar_plot(x, y, title):
    """Show a horizontal bar chart of ``x`` against ``y`` under ``title``."""
    figure = px.bar(x=x, y=y, orientation='h')
    # update_layout returns the figure, so the calls can be chained.
    figure.update_layout(title_text=title).show()
show_bar_plot(counts, label, 'Your title')
def get_id(x):
    """Build "<user_id>_<item_id>" from a row-like mapping.

    Formatting, rather than ``+``, lets numeric ids work as well as string
    ids (string concatenation raised TypeError on integers), in line with
    ``gen_id`` elsewhere in this file.
    """
    return f"{x['user_id']}_{x['item_id']}"
df['id'] = df.apply(get_id, axis=1)
def get_year(string):
    """Return the last whitespace-separated token of ``string`` without parentheses.

    For 'Pulp Fiction (1994)' this is '1994'. Only ')' used to be stripped,
    which left the leading '(' in the result.
    """
    return string.split()[-1].strip('()')
get_year('Pulp Fiction (1994)')
def is_numeric(s):
    """Return True when ``s`` consists of one or more digits and nothing else.

    ``fullmatch`` is used because ``$`` also matches just before a trailing
    newline, so '12\\n' used to be accepted.
    """
    return re.fullmatch(r'\d+', s) is not None
df1 = pd.DataFrame({"Reviews": ["1", "2", "3", "4", "5", "apple", "orange"]})
df1[~df1["Reviews"].apply(is_numeric)]
misclassified = []
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'message': X_test[index], 'actual': y_test[index], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
import math
class Point:
    """A point in the plane with ``x`` and ``y`` coordinates."""

    def __init__(self, x, y):
        self.x, self.y = x, y


def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``."""
    delta_x = a.x - b.x
    delta_y = a.y - b.y
    return math.sqrt(delta_x**2 + delta_y**2)
test.assert_approx_equals(distance(Point(0,0), Point(0,1)), 1)
test.assert_approx_equals(distance(Point(0,0), Point(1,0)), 1)
test.assert_approx_equals(distance(Point(0,0), Point(3,4)), 5)
def how_much_water(water, load, clothes):
    """Return ``water`` up to ``load`` items, else add 10% per extra item (compounded)."""
    return water if clothes <= load else water * 1.1 ** (clothes - load)
def find_duplicates(lst):
    """Count how often each first element occurs across the records in ``lst``.

    Returns a dict mapping first element to its number of occurrences.
    """
    first_fields = [record[0] for record in lst]
    occurrences = {}
    for value in first_fields:
        occurrences[value] = occurrences.get(value, 0) + 1
    return occurrences
def change_duplicates(lst, replacing_num):
stnums = []
duplicates = find_duplicates(lst)
for k, v in duplicates.items():
if v > 1:
stnums = [k]
for student in lst:
if stnums in lst:
#need right code#
def guess_blue(blue_start, red_start, blue_pulled, red_pulled):
    """Return the fraction of remaining marbles that are blue."""
    blue_left = blue_start - blue_pulled
    marbles_left = (blue_start + red_start) - (blue_pulled + red_pulled)
    return blue_left / marbles_left
def first(seq, n=1):
    """Return the first ``n`` items of ``seq``; ``n == 0`` always yields an empty list."""
    return [] if n == 0 else seq[:n]
lst = [['a', 1, 3], ['b', 2, 4], ['c', 5, 6]]
{el[0]: el[1:] for el in lst}
fig = px.bar(x=counts, y=label, orientation='h')
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
import pandas as pd
import numpy as np
df = pd.DataFrame({'Category':['ham','ham','spam','ham','ham'], 'Message':['Go until jurong point, crazy... Available only ...','Ok lar... Joking wif u oni...','Free entry in 2 a wkly comp to win FA Cup fina...','U dun say so early hor... U c already then say...','Nah I don\'t think he goes to usf, he lives aro...']})
df['Category'] = df['Category'].map(lambda x : 1 if x == 'ham' else 0)
import requests
def get_all_files_names(url):
    """Return the ``href`` of every link on the page at ``url`` that ends in '.txt'.

    Anchors without an ``href`` attribute yield None from ``link.get`` and
    used to crash the ``endswith`` filter; they are now skipped.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    return [href for href in hrefs if href and href.endswith('.txt')]
get_all_files_names('http://stup.pro/wp-content/uploads/2023/03/')
df_new.groupby(['route', 'incident_type']).size()
def filter_1980(df):
    """Return the rows of ``df`` whose ``year`` is later than 1980.

    The earlier version overwrote the ``year`` column with True/False flags
    and returned every row, so nothing was filtered and the caller's frame
    was damaged. The input is now left unchanged.
    """
    return df[df["year"] > 1980]
def avg_temp(df):
    """Return the mean of the ``av_temp`` column."""
    temperatures = df["av_temp"]
    return temperatures.mean()
def list_coldest(df, n=20):
    """Return the ``country`` values of the ``n`` rows with the lowest ``av_temp``.

    The rows were sorted and then drawn with ``sample``, which picks rows at
    random (and raises when ``n`` exceeds the row count). ``head`` takes the
    coldest rows in order.
    """
    coldest = df.sort_values(by="av_temp").head(n)
    return list(coldest["country"])
new_df = filter_1980(df)
avg_temp(new_df)
list_coldest(new_df)
def add_stuff(a, b):
    """Return ``a + b``."""
    combined = a + b
    return combined
add_stuff(1, 2)
def foo(x):
    """Return ``x`` squared."""
    return pow(x, 2)
x = range(5)
list(x)
def test_result(drugA, drugB):
    # Unimplemented stub: both arguments are ignored and None is returned.
    return
def add_Columns(df):
    """Rename the four columns of ``df`` to A, B, C, D in place and return ``df``."""
    df.columns = list('ABCD')
    return df
add_Columns(df)
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2)
df.plot(kind='scatter',
x='User_Score',
y='Global_Sales',
ax=axes[0])
df.plot(kind='scatter',
x='Critic_Score',
y='Global_Sales',
ax=axes[1])
plt.show()
def replace_non_numbers(df, column):
    """Strip every non-digit character from ``df[column]`` and return it as floats.

    ``df[column]`` is overwritten in place with the digit-only strings; the
    returned Series holds the float values, with NaN where no digits remain.

    ``regex=True`` is passed explicitly because the pandas default for
    ``str.replace`` is version-dependent and a literal replacement would
    leave the values untouched. Empty results used to crash the float
    conversion.
    """
    digits_only = df[column].astype(str).str.replace(r'[^0-9]', '', regex=True)
    df[column] = digits_only
    return digits_only.where(digits_only != '').astype(float)
#replace_non_numbers(df, 'loan_amnt')
# pandas dataframe
df = pd.DataFrame({'name': ['Ksenia Rodionova',
'Ulyana Selezneva',
'Konstantin Prokhorov',
'Petrov Vladimir',
'Arina Selivanova',
'Artur Petrov',
'Ivan Sidorov',
'Ksenia Rodionova',
'Ksenia Rodionova'],
'date': ['2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01']})
df.head(10)
df_new = df.loc[df.duplicated(subset=['name'], keep=False), :]
df_new
def compare_two_groups(x, y):
    """Run a one-way ANOVA on the ``Speed`` of both groups; True when p < 0.05."""
    _, pvalue = stats.f_oneway(x.Speed, y.Speed)
    return bool(pvalue < 0.05)
def multiple_of_index(arr):
    """Return the elements of ``arr`` that are a multiple of their own index.

    Index 0 is skipped because nothing can be divided by it. The earlier
    test ``i * arr[i] == 0 and arr[i] != 0`` could only hold at index 0, so
    the function returned at most the first element.
    """
    return [value for index, value in enumerate(arr)
            if index != 0 and value % index == 0]
def vertical_bar_chart(question):
    """Show a bar chart of the answer counts for column ``question`` of the global ``data``.

    The parameter used to be rebound to the ``value_counts`` result, so the
    chart title received a whole Series instead of the question text. The
    counts now live in their own variable.
    """
    answers = data[question].value_counts()
    label = answers.index
    counts = answers.values
    colors = ['gold', 'lightgreen']
    fig = go.Figure(data=[go.Bar(x=label, y=counts, marker_color=colors)])
    fig.update_layout(title_text=question)
    fig.show()
vertical_bar_chart(question6)
import json
def normal_split(data):
    """Split ``data`` on commas and return the pieces."""
    pieces = data.split(',')
    return pieces
purchases = {}
for i, line in enumerate(f):
line = json.loads(line.strip())
keys = line['user_id']
values = line['category']
purchases[keys] = values
def replace_number(lst):
    """Change the first field of every row from "4004" to "9090" in place; return ``lst``."""
    matching_rows = [row for row in lst if row[0] == "4004"]
    for row in matching_rows:
        row[0] = "9090"
    return lst
def decade(year):
    """Return the decade label ("1900-1910" ... "2010-2020") that ``year`` falls in.

    Each decade covers ``start < year <= start + 10``; the first one also
    includes 1900 itself. Years outside 1900-2020 return None.

    The hand-written chain ended in an unterminated string literal, which
    made the whole definition a syntax error. It is replaced by a loop over
    the same boundaries.
    """
    for start in range(1900, 2020, 10):
        above_lower = year >= start if start == 1900 else year > start
        if above_lower and year <= start + 10:
            return f"{start}-{start + 10}"
    return None
def round_to_nearest(a):
    """Round ``a`` with the ``round`` in scope (the built-in rounds halves to even)."""
    nearest = round(a)
    return nearest
round_to_nearest(2.5)
def split_data(df, y, test_size=0.2, random_state=42):
    """Split features ``df`` and target ``y`` with ``train_test_split``.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        df, y, test_size=test_size, random_state=random_state)
    return features_train, features_test, target_train, target_test
split_data(tfidf, cats)
def rate_group(int_rate):
    """Bucket an interest-rate string such as '13.49%' into '<10', '10-15' or '>15'.

    The middle bucket now includes exactly 10.0, which used to fall into
    '<10'. That matches the numeric ``rate_group(a)`` helper in this file,
    which uses ``>= 10``.
    """
    rate = float(int_rate.replace('%', ''))
    if rate > 15.0:
        return '>15'
    if 10.0 <= rate <= 15.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def is_opposite(s1, s2):
    """Return True when both strings are non-empty and ``s2`` is ``s1`` with case swapped."""
    if s1 and s2:
        return s1.swapcase() == s2
    return False
def rand_seq(length):
    """Return a random digit string of ``length`` characters containing both '3' and '7'.

    Raises:
        ValueError: if ``length`` is below 2. No such string exists, and the
            earlier version looped forever in that case.
    """
    import random
    if length < 2:
        raise ValueError("length must be at least 2 to contain both '3' and '7'")
    while True:
        seq = "".join(str(random.randint(0, 9)) for _ in range(length))
        if "3" in seq and "7" in seq:
            return seq
rand_seq(15)
def plot_difference_by_hotel(df):
    """Draw one bar-chart facet per hotel showing ``difference`` by ``date``.

    ``sns.factorplot`` was renamed ``catplot`` and later removed from
    seaborn; ``catplot`` takes the same arguments used here.
    """
    sns.catplot(
        x='date', y='difference', col='hotel',
        data=df, kind='bar')
plot_difference_by_hotel(hotels_rev)
def num_of_hits(df):
    """Add a ``num_of_hits`` column (count of ', '-separated hits) and sort by it, descending.

    The column is written to ``df`` in place; the sorted frame is a new object.
    """
    def _count_hits(hits):
        return len(hits.split(', '))

    df['num_of_hits'] = df.hits.apply(_count_hits)
    return df.sort_values(by='num_of_hits', ascending=False)
num_of_hits(df)
def find_max(dictionary):
"""
Dictionary -> String
:param dictionary: example {'AUD': {'ID': 'R01010',
'NumCode': '036',
'CharCode': 'AUD',
'Nominal': 1,
'Name': 'Australian Dollar',
'Value': '46.9983,
'Previous': 45.9496},
'AZN': {'ID': 'R01020A',
'NumCode': '944',
'CharCode': 'AZN',
'Nominal': 1,
'Name': 'AZN',
'Value': 41.4856,
'Previous': 40.5904},
'GBP': {'ID': 'R01035',
'NumCode': '826',
'CharCode': 'GBP',
'Nominal': 1,
'Name': 'Pound Sterling United Kingdom',
'Value': 85.
def dish(dish, portions):
cook_book = {
'пицца': [
{'ingridient_name': 'сыр', 'quantity': 20, 'measure': 'гр'},
{'ingridient_name': 'колбаса', 'quantity': 30, 'measure': 'гр'},
{'ingridient_name': 'бекон', 'quantity': 30, 'measure': 'гр'},
{'ingridient_name': 'оливки', 'quantity': 10, 'measure': 'гр'},
],
'лимонад': [
{'ingridient_name': 'лимон', 'quantity': 1, 'measure': 'шт'},
{'ingridient_name': 'вода', 'quantity': 200, 'measure': 'мл'},
{'ing
def find_obj_str(df):
    """Return the columns of ``df`` whose dtype is object or pandas' string dtype.

    ``select_dtypes`` rejects the bare 'str' alias on many pandas versions
    ("string dtypes are not allowed, use 'object' instead"). 'object' covers
    ordinary text columns and 'string' covers the dedicated string dtype.
    """
    return df.select_dtypes(include=['object', 'string'])
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
ax[0].plot(df[['User_Score', 'Global_Sales']],
df['Global_Sales'].max())
ax[1].plot(df[['Critic_Score', 'Global_Sales']],
df['Global_Sales'].max())
import pandas as pd
df = pd.DataFrame(
{'id': [4728, 35638, 21445, 40291, 29462, 46978, 42931, 38670, 25506, 43989],
'title': ['Bad Education (2019)', 'Palooka (1934)', 'High Moon (2019)', 'Saint Maud (2019)',
'Mad at the Moon (1992)', 'The Butterfly Ball (1977)',
'Snowboarďáci (2004)', 'Recon 2020: The Caprini Massacre (2004)',
'Karagoez catalogo 9,5 (1983)', 'Starting a Skyscraper (1902)'],
'rating': [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
'decade': ['2010-2020', '1934', '2010-2020', '2010-2020',
'1990-2000', '1970-1980', '
def group(df):
    """Join each performer's songs into one ', '-separated string."""
    songs_by_performer = df.groupby(['performer'])['song']
    joined = songs_by_performer.apply(', '.join)
    return joined.reset_index()
def get_source_type(traffic_source, region):
    """Return the ``source_type`` values of the global ``df`` for the given source and region."""
    matches_source = df.traffic_source == traffic_source
    matches_region = df.region == region
    return df[matches_source & matches_region].source_type.values
get_source_type('twitter', 'east')
def make_numeric(df):
    """Convert every column of ``df`` to numbers in place (unparseable values become NaN)."""
    for name in df.columns:
        converted = pd.to_numeric(df[name], errors='coerce')
        df[name] = converted
    return df
def sort_by(df, column, ascending=True):
    """Return ``df`` sorted by ``column`` in the requested direction."""
    ordered = df.sort_values(column, ascending=ascending)
    return ordered
def multiply(dictionary):
    # Multiply, in place, the 'quantity' of every item in dictionary[key] by
    # `portions`, then return the same dictionary.
    # NOTE(review): `key` and `portions` are not parameters or locals; they
    # must exist as globals when this runs, otherwise NameError is raised.
    for i in dictionary[key]:
        i['quantity'] = i['quantity'] * portions
    return dictionary
multiply(cook_book)
year_leaders = df.groupby('chart_debut').num_of_hits.transform(max) == df['num_of_hits']
df[year_leaders][['performer', 'num_of_hits']]
def fix_data(X_train, X_test, y_train, y_test):
    """Truncate each feature set to the length of its target; targets pass through unchanged."""
    n_train = y_train.shape[0]
    n_test = y_test.shape[0]
    return X_train[:n_train], X_test[:n_test], y_train, y_test
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
def to_binary(n):
    """Return the binary digits of ``n`` as a string (values below 2 are returned via ``str``)."""
    low_digits = []
    while not n < 2:
        low_digits.append(str(n % 2))
        n = n // 2
    return str(n) + "".join(reversed(low_digits))
import scipy.stats as stats
def mann_whitney_test(data1, data2, alpha=0.05):
    """Run a Mann-Whitney U test and print the statistic, p-value and verdict at ``alpha``."""
    stat, p = stats.mannwhitneyu(data1, data2)
    print(f'Statistics={stat:.3f}, p={p:.3f}')
    verdict = ('Same distribution (fail to reject H0)' if p > alpha
               else 'Different distribution (reject H0)')
    print(verdict)
mann_whitney_test(data[data['version'] == 'gate_30']['sum_gamerounds'], data[data['version'] == 'gate_40']['sum_gamerounds'])
import pandas as pd
df_ru = pd.read_csv('../data/ft_ru.csv', parse_dates=[0], index_col=[0], dayfirst=True)
def hot_years(df):
    # Return the resampled periods of `df` whose mean `av_temp` exceeds 15.
    # Side effect: adds a 'year' column to the caller's frame, taken from
    # its index (so the index must expose `.year`).
    df['year'] = df.index.year
    # 'A' is presumably pandas' year-end resampling alias -- TODO confirm it
    # is still accepted by the installed pandas version.
    # Note: the local name below shadows this function's own name.
    hot_years = df.resample('A').mean()
    hot_years = hot_years[hot_years['av_temp'] > 15]
    return hot_years
hot_years(df_ru)
from sklearn.preprocessing import StandardScaler
def scale(sl, sw):
    """Standardise ``sl`` and ``sw`` together; return an array with one column each.

    ``fit_transform(sl, sw)`` treated ``sw`` as the ignored target ``y`` and
    passed ``sl`` as a 1-D sequence, which the scaler rejects. The two
    sequences are stacked as columns so both are scaled.
    """
    import numpy as np
    scaler = StandardScaler()
    features = np.column_stack([sl, sw])
    return scaler.fit_transform(features)
scale(sl, sw)
plt.xticks(rotation = 0)
plt.xlabel('nutrient')
plt.ylabel('%')
plt.title('nutrient vs mineral')
plt.grid(axis = 'y')
plt.legend(loc = 'best')
import matplotlib.pyplot as plt
import numpy as np
def plot(A, B):
    """Show a box plot and then a histogram comparing samples ``A`` and ``B``."""
    for draw in (plt.boxplot, plt.hist):
        draw([A, B])
        plt.show()
A = [1, 2, 3]
B = [4, 3, 2]
plot(A, B)
import matplotlib.pyplot as plt
plt.bar(df.index, df[col_name])
plt.show()
def replace(stnum, students, replacing_num):
    """Return a copy of ``students`` in which number ``stnum`` is replaced by ``replacing_num``.

    Each record is a list whose first field is the student number. Two
    defects are fixed:
    * ``students.copy()`` is shallow, so editing rows also changed the
      caller's data; every row is now copied.
    * The loop overwrote every field except the number; the number itself
      (field 0) is what gets replaced, as in ``replace_number`` and
      ``replace_numbers`` in this file.
    """
    result = [list(record) for record in students]
    for record in result:
        if record and record[0] == stnum:
            record[0] = replacing_num
    return result
stnums = ['4004']
students = [
['0001', 'Antonov', 'Anton', 'Igorevich', '20.08.2009', 'BST161'],
["1102", "Bogov", "Artem", "Igorevich", "25.01.2010", "BST162"]
["0333", "Glagoleva", "Anastasiya", "Nikolaevna", "11.07.2009", "BST163"]
["4004", "Stepanova", "Natalia", "Aleksandrovna", "13.02.2008", "BST161"]
["0045", "Bokov", "Igor", "Kharitonovich", "02.06.2009", "BST161
def avgrating(df):
    # Unimplemented exercise stub: returns the input frame unchanged.
    #your code here
    return df
def same_case(a, b):
    """Compare the case of two strings.

    Returns -1 unless both are alphabetic, 1 when both are lower case or
    both upper case, and 0 otherwise.
    """
    if not (a.isalpha() and b.isalpha()):
        return -1
    both_lower = a.islower() and b.islower()
    both_upper = a.isupper() and b.isupper()
    return 1 if both_lower or both_upper else 0
same_case('a', 'g') # 1
same_case('A', 'C') # 1
same_case('b', 'G') # 0
same_case('B', 'g') # 0
same_case('0', '?') # -1
def hot_years(df):
    """Return the per-``year`` column means for years whose mean ``av_temp`` exceeds 15."""
    yearly_means = df.groupby('year').mean()
    is_hot = yearly_means.av_temp > 15
    return yearly_means.loc[is_hot]
hot_years(df_ru)
def duplicates(df):
    """Return every row whose ``name`` occurs more than once, sorted by name."""
    repeated = df.name.duplicated(keep=False)
    return df[repeated].sort_values('name')
duplicates(df)
def search_goods():
    """Ask for a number and return the key of the global ``directories`` entry containing it.

    Returns 'Nothing found' when no entry holds the entered value.
    """
    global directories
    wanted = input('Input directory number: ')
    for shelf, documents in directories.items():
        if any(wanted == document for document in documents):
            return shelf
    return 'Nothing found'
import pandas as pd
import numpy as np
r_cols = ['userId','movieId','rating','timestamp']
ratings = pd.read_csv('ml-latest-small/ratings.csv',usecols=r_cols)
#your code here
ratings
def convert_to_year(x):
    """Return the year of ``x`` parsed by ``pd.to_datetime``, or None if it cannot be parsed.

    Only parsing errors are caught. The bare ``except`` also swallowed
    KeyboardInterrupt, SystemExit and genuine programming errors.
    """
    try:
        date = pd.to_datetime(x)
    except (ValueError, TypeError, OverflowError):
        return None
    return date.year
df['year'] = df.date.apply(convert_to_year)
def to_binary(n):
    """Return the binary digits of integer ``n`` as a string without the '0b' prefix.

    ``bin`` returns e.g. '0b101'. The other ``to_binary`` in this file
    yields bare digits ('101'), so this one is aligned with it.
    """
    return format(n, 'b')
def count_list(list):
    """Return a dict mapping each item of ``list`` to how many times it occurs."""
    tally = {}
    for item in list:
        tally[item] = tally.get(item, 0) + 1
    return tally
count_list(a)
def tokenize(text):
    """Return every match of the module-level ``tokens_re`` pattern in ``text``."""
    matches = tokens_re.findall(text)
    return matches
def preprocess(text):
    """Tokenize ``text``, drop tokens found in ``stop_words`` and lemmatize the rest."""
    return [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokenize(text)
        if token not in stop_words
    ]
from scipy.stats import mannwhitneyu
def wilcoxon(a, b):
    """Print the Mann-Whitney U statistic and p-value comparing ``sum_gamerounds``
    of versions ``a`` and ``b`` in the global ``data`` frame."""
    rounds_a = data[data['version'] == a]['sum_gamerounds']
    rounds_b = data[data['version'] == b]['sum_gamerounds']
    stat, p = mannwhitneyu(rounds_a, rounds_b)
    print(f'Mann-Whitney Statistics={stat:.3f}, p={p:.3f}')
wilcoxon('gate_30','gate_40')
def transpose(matrix):
    '''
    Return the transpose of a rectangular list-of-lists matrix.
    '''
    column_count = len(matrix[0])
    return [[row[col] for row in matrix] for col in range(column_count)]
def change_quantity(ingridient, portions):
    """Return the ingredient's ``quantity`` multiplied by ``portions``."""
    base_quantity = ingridient['quantity']
    return base_quantity * portions
change_quantity(ingridient, portions)
import plotly.express as px
def horizontal_bar(question, title):
    """Show a horizontal bar chart of the answer counts for column ``question`` of the global ``df``.

    With ``orientation='h'`` the bar lengths go on x and the categories on
    y. They were passed the other way round, unlike the other horizontal
    charts in this file (``px.bar(x=counts, y=label, orientation='h')``).
    """
    answers = df[question].value_counts()
    label = answers.index
    counts = answers.values
    fig = px.bar(x=counts, y=label, orientation='h')
    fig.update_layout(title_text=title)
    fig.show()
horizontal_bar('What is the most preferred working environment for you?', 'Какая рабочая среда для вас наиболее предпочтительна?')
data = data.sort_values(by = ['max'], ascending = False).head(20)
data
import pandas as pd
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings = ratings.drop('unix_timestamp', axis=1)
ratings.head()
import re
def rate_group(int_rate):
    """Bucket an interest-rate string such as '13.49%' into '<10', '10-15' or '>15'.

    A rate of exactly 10.0 belongs to '10-15'; the strict ``10.0 <`` test
    used to put it in '<10'.
    """
    rate = float(int_rate.replace('%', ''))
    if rate > 15.0:
        return '>15'
    if 10.0 <= rate <= 15.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def change_shelf(data):
    # Move document '11-2' to shelf '3' inside `data` and return `data`
    # (mutated in place). Document and shelf are hard-coded.
    docnum = '11-2'
    shelf = '3'
    # NOTE(review): list(docnum) is the list of single characters
    # ['1', '1', '-', '2'], so this only prints 'Error' when some shelf
    # holds exactly those four one-character entries -- confirm the intent.
    if list(docnum) in data.values():
        print('Error')
    for key, val in data.items():
        for doc in val:
            if doc == docnum:
                data[shelf].append(doc)
                # NOTE(review): removes from the list being iterated; if the
                # source shelf is also the target shelf the appended copy is
                # visited again.
                data[key].remove(doc)
                print('OK')
    return data
from scipy import stats
from math import sqrt
def interval(n, mean, sig, conf):
    """Return the truncated full width of a normal confidence interval for the mean."""
    quantile = stats.norm.ppf((1 + conf) / 2)
    half_width = sig * quantile / sqrt(n)
    return int(2 * half_width)
def sum_list(lst):
    """Return the sum of ``lst[i] + lst[j]`` over all ordered index pairs with ``i != j``.

    Every element appears in ``2 * (len(lst) - 1)`` such pairs, so the total
    is computed directly rather than with a quadratic double loop. This also
    stops the built-in ``sum`` being shadowed by a local.
    """
    return 2 * (len(lst) - 1) * sum(lst)
sum_list([2, 1, 10, 5])
df[df['loan_amnt'].str.contains('[a-z]', flags=re.IGNORECASE, regex=True)]
df[['performer', 'time_on_chart']].groupby('performer').agg({'min', 'max'}).sort_values('max', ascending = False)
class Point:
    """A point in the plane."""

    def __init__(self, x, y):
        # Store the coordinates as given.
        self.x = x
        self.y = y


def distance(a, b):
    """Return the straight-line distance between ``a`` and ``b``."""
    squared_dx = (a.x - b.x)**2
    squared_dy = (a.y - b.y)**2
    return math.sqrt(squared_dx + squared_dy)
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False)
plt.rcParams.update({'font.size': 14})
plt.rcParams["axes.linewidth"] = 3
plt.show()
import matplotlib.pyplot as plt
def linegraph(df):
    """Plot the '2015'-'2019' columns of ``df`` against its index, with a legend."""
    for year in ('2015', '2016', '2017', '2018', '2019'):
        plt.plot(df.index, df[year], label = year)
    plt.legend()
    plt.show()
linegraph(df)
from sklearn.metrics import f1_score
y_pred = lda_model.predict(X_val)
f1_score(y_val, y_pred)
import pandas as pd
import datetime
import numpy as np
df_ratings = pd.read_csv('C:/Users/User/Downloads/ml-latest-small/ratings.csv')
def diff_pd(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest, lowest = x.max(), x.min()
    return highest - lowest
df_ratings.groupby('userId')['timestamp'].agg([diff_pd]).mean()
def rock(dataframe):
    """Drop, in place, every row whose 'Class 1' or 'Class 2' is 'Rock'; return the frame.

    The earlier loop used positions ``0..len-1`` as index labels, which
    raises KeyError unless the frame has a default integer index. A boolean
    mask works with any index.
    """
    is_rock = (dataframe['Class 1'] == 'Rock') | (dataframe['Class 2'] == 'Rock')
    dataframe.drop(dataframe.index[is_rock], inplace=True)
    return dataframe
rock(df)
def count5_decade(df):
    """Count, per ``decade``, the rows whose ``rating`` equals 5.0."""
    five_star = df[df.rating == 5.0]
    return five_star.decade.value_counts()
count5_decade(ratings)
import pandas as pd
from scipy import stats
df1 = pd.read_csv(url, sep='\s+', skiprows=3, nrows=4)
df2 = pd.read_csv(url, sep='\s+', skiprows=7, nrows=4)
df1.columns = ['A', 'B', 'C']
df2.columns = ['A', 'B', 'C']
print(stats.f_oneway(df1['A'], df2['A']))
print(stats.f_oneway(df1['B'], df2['B']))
print(stats.f_oneway(df1['C'], df2['C']))
def move(directories, str1, str2):
    """Move document ``str1`` to shelf ``str2`` inside ``directories``.

    Args:
        directories: dict mapping shelf name to a list of document numbers.
        str1: document number to move (e.g. '11-2').
        str2: target shelf (e.g. '3'); created if it does not exist.

    Returns:
        The updated ``directories`` (mutated in place), or the string
        'ERROR NO SUCH VALUE' when the document is on no shelf.

    The earlier version looked the document number up as a shelf key, so
    the example call ``move(directories, '11-2', '3')`` always returned an
    error. It also appended the shelf name instead of the document and
    removed items from a list while indexing through it.
    """
    source_shelf = next(
        (shelf for shelf, documents in directories.items() if str1 in documents),
        None,
    )
    if source_shelf is None:
        return 'ERROR NO SUCH VALUE'
    directories[source_shelf].remove(str1)
    directories.setdefault(str2, []).append(str1)
    return directories
print(move(directories, '11-2', '3'))
import pandas as pd
df = pd.read_csv('data/charts.csv')
dfs = df.copy()
def chart_debut_format(chart_debut):
    """Return the first four characters of ``chart_debut`` (the year of a 'YYYY-...' string)."""
    year_part = chart_debut[0:4]
    return year_part
dfs['chart_debut'] = dfs['chart_debut'].apply(chart_debut_format)
dfs.head()
def multiple_of_index(arr):
    """Return the elements of ``arr`` that are a multiple of their own index.

    Index 0 is excluded. The guard is tested before the modulo: in the
    earlier order ``num % i`` ran first and raised ZeroDivisionError on the
    very first element.
    """
    return [num for i, num in enumerate(arr) if i != 0 and num % i == 0]
multiple_of_index([22, -6, 32, 82, 9, 25])
def merge_df(df1, df2):
    """Inner-join the two frames on their shared ``client_id`` column."""
    joined = df1.merge(right=df2, on='client_id')
    return joined
merge_df(rzd, auto)
df_new.id = df_new.id.astype(int)
def date_range(start_date, end_date):
    """Return every day from ``start_date`` to ``end_date`` inclusive as 'YYYY-MM-DD' strings.

    Both arguments are 'YYYY-MM-DD' strings. An empty list is returned when
    the start is after the end or when either argument cannot be parsed.

    The earlier version appended only the start date (it had no loop) and
    used a bare ``except`` that hid every kind of error.
    """
    try:
        current = dt.strptime(start_date, '%Y-%m-%d')
        end = dt.strptime(end_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        return []
    lst = []
    while current <= end:
        lst.append(current.strftime('%Y-%m-%d'))
        current += td(days=1)
    return lst
import math
class Point(object):
    """A 2-D point; coordinates default to the origin."""

    def __init__(self, x=0, y=0):
        self.x, self.y = x, y

    def distance(self, other):
        """Return the Euclidean distance from this point to ``other``."""
        dx_squared = math.pow((other.x - self.x), 2)
        dy_squared = math.pow((other.y - self.y), 2)
        return math.sqrt(dx_squared + dy_squared)
installs.plot(kind='bar',
title='Топ 10 издателей по продажам видеоигр',
xlabel='Издатель',
ylabel='Количество игр',
logy=True)
def count(df):
    """Return, per column, how many cells of ``df`` equal '?'."""
    only_questions = df.where(df == '?')
    return only_questions.count()
def sum_all(arr):
    """Return the sum of ``x + y`` over every ordered pair from ``arr`` (self-pairs included)."""
    total = 0
    for x in arr:
        for y in arr:
            total += x + y
    return total
sum_all([2, 1, 10, 5])
def extract_year(title):
    """Return the last four-digit year written in parentheses in ``title``.

    Returns the year as a string, or None when the title has no ``(YYYY)``
    group. Splitting on the first '(' picked the wrong text for titles such
    as 'City of God (Cidade de Deus) (2002)' and raised IndexError when
    there were no parentheses at all.
    """
    import re
    years = re.findall(r'\((\d{4})\)', title)
    return years[-1] if years else None
df['year'] = df['title'].apply(extract_year)
def make_list(direct):
    """Return the keys of ``direct`` joined by commas, in insertion order."""
    shelf_names = direct.keys()
    return ','.join(shelf_names)
make_list(directories)
def is_acceptable_password(password: str) -> bool:
    """Accept passwords longer than 6 characters that contain a digit, but none in the last 9."""
    if len(password) <= 6:
        return False
    has_digit = any(char.isdigit() for char in password)
    digit_in_tail = any(char.isdigit() for char in password[-9:])
    return has_digit and not digit_in_tail
import numpy as np
A = np.array([6, 8, 8, 10, 12, 12, 12, 12, 12, 14, 14, 14, 14, 16, 16, 16, 18, 18, 18, 18, 22])
B = np.array([10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20])
def plot_two_boxplots(speed, boosted_speed):
    """Show side-by-side box plots of ``speed`` and ``boosted_speed``."""
    import numpy as np
    import matplotlib.pyplot as plt
    samples = [speed, boosted_speed]
    plt.boxplot(samples)
    plt.show()
def sales_df(regions, na_sales, eu_sales, jp_sales, oth_sales):
    # NOTE(review): none of the five parameters is used. The function
    # returns whatever the name `df` refers to outside it (NameError if no
    # such global exists). Presumably a frame should be built from the
    # arguments -- confirm.
    return df
sales_df(regions, na_sales, eu_sales, jp_sales, oth_sales)
from scipy import stats
stats.f_oneway(p['Speed'], boosted_p['Speed'])
def add_columns(a, b, c):
    """Store ``a``, ``b`` and ``c`` as columns 'A', 'B' and 'C' of the global ``df``."""
    for column_name, values in (('A', a), ('B', b), ('C', c)):
        df[column_name] = values
add_columns(water, nutri, mineral)
df.groupby('userId').filter(lambda x: len(x) >= 100)
df.groupby('userId').mean()
# import pandas
import pandas as pd
# import the data
data = pd.read_csv('https://s3.amazonaws.com/assets.datacamp.com/production/course_2023/datasets/imdb_1000.csv')
# check the data
data
# check the shape of the data
data.shape
# drop the rows with null values
data.dropna(inplace=True)
# drop the rows with null values
data.dropna(inplace=True)
# check the shape of the data
data.shape
# find the best decade
data.groupby('decade')['rating'].mean().sort_values(ascending=False)
import math
def sample_size(delta, sigma, conf):
    """Return the sample size needed to detect a difference ``delta``.

    Computes ``ceil(2 * z**2 * sigma**2 / delta**2)``, where ``z`` is the
    two-sided standard-normal quantile for confidence level ``conf``.

    ``z`` used to be set to the probability ``(1 + conf) / 2`` itself (0.975
    for 95%) rather than the quantile at that probability (about 1.96),
    which understated the sample size.
    """
    from statistics import NormalDist
    z = NormalDist().inv_cdf(0.5 * (1 + conf))
    return math.ceil(2 * (z ** 2) * (sigma ** 2) / (delta ** 2))
sample_size(0.1, 0.3, 0.95)
def decade(x):
    """Return the decade of year ``x`` as "YYY0-YYY0+10", or NaN if ``x`` is not a year.

    Accepts digit strings ('1994') and integers (1994). Anything else,
    including missing values, yields NaN. Integers used to raise
    AttributeError (no ``isdigit``), and the decade was derived through
    float division; integer floor division is used instead.
    """
    import numbers
    if isinstance(x, str):
        if not x.isdecimal():
            return float('NaN')
        year = int(x)
    elif isinstance(x, numbers.Integral) and not isinstance(x, bool):
        year = int(x)
    else:
        return float('NaN')
    start = year // 10 * 10
    return str(start) + "-" + str(start + 10)
df['Decade of Release'] = df['Year'].apply(decade)
df
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
def year_leaders(dfp):
    """Return a copy of ``dfp`` sorted by debut and hit count (both descending),
    indexed by ``chart_debut``."""
    sort_keys = ['chart_debut', 'num_of_hits']
    ordered = dfp.copy().sort_values(by=sort_keys, ascending=False)
    return ordered.set_index('chart_debut')
year_leaders(dfp)
def object_finder(dataframe, column):
    # Return the rows of `dataframe` for which the predicate below holds on
    # `column`.
    # NOTE(review): `type(x) == object` is only true for bare `object()`
    # instances, never for strings or numbers, so this normally selects no
    # rows. Presumably a check for non-numeric values was intended -- confirm.
    return dataframe[dataframe[column].apply(lambda x: type(x) == object)]
object_finder(data, 'values')
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(px.bar(x=label, y=counts, orientation='v'), 1, 1)
fig.add_trace(px.bar(x=label, y=counts, orientation='v'), 1, 2)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?',
showlegend=False)
fig.show()
def year_leaders(df):
    """Return the column-wise maximum of ``performer`` and ``hits`` for each ``chart_debut``."""
    selected = df[["performer", "hits"]]
    by_debut = selected.groupby(df['chart_debut'])
    return by_debut.max()
def magic(arr):
    """Return the sum of ``arr[i] + arr[j]`` over every index pair with ``i < j``."""
    size = len(arr)
    return sum(
        arr[left] + arr[right]
        for left in range(size)
        for right in range(left + 1, size)
    )
magic(arr)
def add(a, b):
    """Return the sum of ``a`` and ``b``.

    The body subtracted (``a - b``), contradicting the function's name and
    the other ``add`` in this file.
    """
    return a + b
add(1, 2)
def year_leaders(dfp):
    """Return, per ``chart_debut``, the row with the highest ``num_of_hits``.

    The result is indexed by ``chart_debut`` in descending order; ``dfp`` is
    not modified.
    """
    working = dfp.copy()
    working = working.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    working = working.drop_duplicates(subset='chart_debut', keep='first')
    working = working.set_index('chart_debut')
    return working
year_leaders(dfp)
def calculate_respiratory_rate(pulse, respiratory_rate):
    """Return ``respiratory_rate``, or a fallback value when it is NaN.

    A value that is not equal to itself is treated as missing (NaN).
    NOTE(review): the fallback ``pulse/pulse + 0.5`` is 1.5 for any non-zero
    pulse. Confirm the intended formula.
    """
    if respiratory_rate != respiratory_rate:
        return pulse/pulse + 0.5
    return respiratory_rate
calculate_respiratory_rate(60, np.nan)
calculate_respiratory_rate(60, 12)
# Update x axis
fig.update_xaxes(
title_text="Количество ответов",
tickvals=counts,
ticktext=label
)
# Set y-axis title
fig.update_yaxes(title_text="Ваш выбор")
fig.show()
def year_leaders(dfp):
    """Print and return the row with the most hits for each debut year.

    The result is indexed by ``chart_debut``; ``dfp`` is not modified.
    """
    leaders = (
        dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
        .drop_duplicates(subset='chart_debut', keep='first')
        .set_index('chart_debut')
    )
    print(leaders)
    return leaders
data1 = df['Critic_Score']
data1.to_numpy()
def replace_numbers(stnum, stus, repnum):
    """Replace the leading element ``stnum`` with ``repnum`` in each record.

    Records in ``stus`` are modified in place; the same list is returned.
    """
    for record in stus:
        if record[0] == stnum:
            record[0] = repnum
    return stus
replace_numbers('4004', students, '9090')
def check_password(string):
    """Return True when ``string`` contains the substring "password"."""
    found = "password" in string
    return bool(found)
check_password("asdfasdfpasswordasdfasdf")
def anova(data):
    """Run a one-way ANOVA on the ``Speed`` columns of ``p`` and ``boosted_p``.

    NOTE(review): ``data`` is never used; the samples come from the
    module-level names ``p`` and ``boosted_p``. Confirm that is intended.

    Returns:
        Tuple ``(fvalue, pvalue)``.
    """
    result = stats.f_oneway(p.Speed, boosted_p.Speed)
    fvalue, pvalue = result
    return (fvalue, pvalue)
def checkio(delta, sigsqr, conf):
    """Return the truncated sample size z**2 * sigsqr / delta**2.

    ``z`` is the standard-normal quantile at ``(1 + conf) / 2``. The result
    is truncated with ``int``, not rounded up.
    """
    from scipy.stats import norm

    z_score = norm.ppf((1 + conf) / 2)
    required = (z_score ** 2 * sigsqr) / (delta ** 2)
    return int(required)
# Some hints
# You can use stats.norm.ppf function for the normal distribution
#These "asserts" using only for self-checking and not necessary for auto-testing
if __name__ == '__main__':
assert checkio(0.02, 0.04, 0.95) == 491
assert checkio(0.02, 0.005, 0.95) == 4127
print("Coding complete? Click 'Check' to earn cool rewards!")
def year_leaders(df):
    """Placeholder: not implemented yet, returns None."""
    return None
def total_ingridients():
    """Print the salad ingredients scaled to five portions."""
    cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
                           {'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
                           {'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
    portions = 5
    dish = 'salad'
    print(f'{dish}')
    for item in cook_book[dish]:
        amount = item["quantity"] * portions
        print(f'{item["ingridient_name"]}: {amount}{item["measure"]}')
def group_by_two_columns(dataframe, col1, col2, col3):
    """Count the values of ``col3`` within each ``(col1, col2)`` group.

    The original ignored its arguments: it read a global ``df`` and used the
    literal strings ``'col1'``, ``'col2'`` and ``'col3'`` as column names.

    Args:
        dataframe: Source DataFrame.
        col1: First grouping column name.
        col2: Second grouping column name.
        col3: Column whose values are counted.

    Returns:
        Series of counts indexed by ``(col1, col2, col3 value)``.
    """
    return dataframe.groupby([col1, col2])[col3].value_counts()
stats.f_oneway(*(df[col] for col in df.columns))
def year_leaders(df):
    """Return the largest ``num_of_hits`` for each ``chart_debut`` value."""
    grouped = df.groupby(['chart_debut'])
    hits = grouped['num_of_hits']
    return hits.max()
def is_acceptable_password(password: str) -> bool:
    """Check the length and digit rules for a password.

    Passwords of 6 characters or fewer are rejected, those longer than 9 are
    always accepted, and those in between are accepted unless they consist of
    digits only.
    """
    length = len(password)
    if length <= 6:
        return False
    if length > 9:
        return True
    return not password.isdigit()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Draw two 2-D vectors from the origin and compute the angle between them.
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
vec_1 = np.array([1, 1])
vec_2 = np.array([1, 0])
x_coords = [0, vec_1[0], vec_2[0]]
y_coords = [0, vec_1[1], vec_2[1]]
plt.plot(x_coords, y_coords, color='black')
plt.arrow(0, 0, vec_1[0], vec_1[1], head_width=0.1, head_length=0.1, color='black')
plt.arrow(0, 0, vec_2[0], vec_2[1], head_width=0.1, head_length=0.1, color='black')
# The original statement stopped at an unclosed parenthesis. Completed with
# the standard formula cos(theta) = (a . b) / (|a| * |b|).
angle = np.arccos(np.dot(vec_1, vec_2) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2)))
import numpy as np
def sum_matrix(N):
    """Return the sum of the diagonal N-1, N-2, ..., 0 of an N x N matrix."""
    descending = np.arange(N - 1, -1, -1)
    diagonal_matrix = np.diag(descending, k=0)
    return sum(diagonal_matrix.diagonal())
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
def highlight(df):
return df.style.highlight_max(axis=1,
props='color:white;\
font-weight:bold;\
background-color:green;')
return df.style.highlight_min(axis=1,
props='color:white;\
font-weight:bold;\
background-color:brown;')
#your code here
def new_doc_add():
    """Prompt for a document and register it in the module-level stores.

    Appends the document to ``documents`` and its number to the shelf entry
    in ``directories``, creating the shelf when it does not exist.
    """
    docnum = input('Enter the number of your document: ')
    doctype = input('Enter the type of your document: ')
    docowner = input('Enter the owner of your document: ')
    shelf = input('Enter the shelf number: ')
    record = {'type': doctype, 'number': docnum, 'name': docowner}
    documents.append(record)
    directories.setdefault(shelf, []).append(docnum)
song performer chart_debut peak_position worst_position time_on_chart consecutive_weeks hits
Stupid Cupid Connie Francis 1958-08-02 17 72 12 11.0 Stupid Cupid
Chantilly Lace Big Bopper 1958-08-02 6 40 18 17.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 19 18.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 20 19.0 Chantilly Lace
Chantilly Lace Big Bopper 1958-08-02 6 40 21 20.0 Chantilly Lace
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Raises ZeroDivisionError for an empty sequence.
    """
    running = 0
    for value in numbers:
        running += value
    return running / len(numbers)
def median(numbers):
    """Return the median of ``numbers`` without modifying the input.

    For an even count the two middle values are averaged. The original
    sorted the caller's list in place and relied on a sibling ``mean``
    helper; this version works on a sorted copy and is self-contained.

    Raises:
        IndexError: If ``numbers`` is empty.
    """
    ordered = sorted(numbers)
    count = len(ordered)
    mid = count // 2
    if count % 2 == 0:
        # Even length: average the two values around the middle.
        return (ordered[mid - 1] + ordered[mid]) / 2
    return ordered[mid]
import numpy as np
A1 = np.array([[0, 0, 0, 0, 1, 0],
               [0, 1, 0, 1, 0, 0],
               [0, 0, 1, 1, 1, 0],
               [1, 0, 0, 1, 0, 1]])
vector = [1, 2, 3, 4]
# A1 is 4x6 and vector has 4 entries, so A1.dot(vector) fails on mismatched
# shapes. Weight the four rows by the vector instead (same as vector @ A1).
A1.T.dot(vector)
def create_sales_df(df):
    """Return total sales per region as a two-column DataFrame.

    Columns are ``regions`` and ``sales``; each sale is the sum of the
    matching ``*_Sales`` column of ``df``.
    """
    region_columns = {
        'North America': 'NA_Sales',
        'Europe': 'EU_Sales',
        'Japan': 'JP_Sales',
        'Other': 'Other_Sales',
    }
    totals = [df[column].sum() for column in region_columns.values()]
    return pd.DataFrame({'regions': list(region_columns), 'sales': totals})
create_sales_df(df)
def delete_row_with_rock(df):
    """Return ``df`` without the rows whose class columns mention "Rock".

    The original had no body. This follows the ``delete`` function elsewhere
    in the file, which filters on the ``Class 1`` and ``Class 2`` columns.
    Missing values count as "no match".
    """
    has_rock = (
        df['Class 1'].str.contains('Rock', na=False)
        | df['Class 2'].str.contains('Rock', na=False)
    )
    return df[~has_rock]
import re
def remove_quotes(d):
    """Return a new dict with every ``' "'`` sequence removed.

    The space-plus-double-quote sequence is stripped from both keys and
    values. The original inserted the cleaned keys into ``d`` while
    iterating over it, which changes the dict's size mid-iteration and
    leaves the uncleaned keys behind; a fresh dict avoids both.
    """
    pattern = r" \""
    return {re.sub(pattern, "", key): re.sub(pattern, "", value)
            for key, value in d.items()}
remove_quotes({' "user_id"': ' "category"}', ' "1840e0b9d4"': ' "Products"}'})
from datetime import date, timedelta as td
def date_range(start_date, end_date):
    """Return every date from ``start_date`` to ``end_date`` inclusive.

    Args:
        start_date: First day as a ``'YYYY-MM-DD'`` string.
        end_date: Last day as a ``'YYYY-MM-DD'`` string.

    Returns:
        List of ``'YYYY-MM-DD'`` strings; empty when either input cannot be
        parsed or the start lies after the end.
    """
    # The original called ``dt.strptime`` although ``dt`` was never defined;
    # the bare ``except`` hid the NameError, so every call returned [].
    from datetime import datetime, timedelta

    try:
        current = datetime.strptime(start_date, '%Y-%m-%d')
        last = datetime.strptime(end_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        return []

    days = []
    while current <= last:
        days.append(current.strftime('%Y-%m-%d'))
        current += timedelta(days=1)
    return days
def millionaire(df):
    """Return the rows whose ``annual_inc`` is at least one million."""
    is_rich = df['annual_inc'] >= 1000000
    return df[is_rich]
millionaire(df)
def divide_hotels(df):
    """Add per-size profit columns to ``df`` in place and return it.

    ``big_hotels`` holds the profit for more than 30 rooms, ``medium_hotels``
    for 21-30 rooms and ``small_hotels`` for 11-20 rooms; every other cell
    is 0.
    """
    def profit_when(in_band):
        # Row-wise function: profit when the room count is in the band.
        return lambda row: row['profit'] if in_band(row['total_rooms']) else 0

    df['big_hotels'] = df.apply(profit_when(lambda rooms: rooms > 30), axis=1)
    df['medium_hotels'] = df.apply(
        profit_when(lambda rooms: rooms <= 30 and rooms > 20), axis=1)
    df['small_hotels'] = df.apply(
        profit_when(lambda rooms: rooms <= 20 and rooms > 10), axis=1)
    return df
df = divide_hotels(df)
df.head()
# Region name -> cities belonging to that region.
geo_data = {'Center': ['Moscow', 'Tula', 'Yaroslavl'], 'Northwest': ['Petersburg', 'Pskov', 'Murmansk'], 'Far East': ['Vladivostok', 'Sakhalin', 'Khabarovsk']}


def geo_class(city):
    """Return the region that lists ``city``, or None when no region does."""
    matches = (region for region, cities in geo_data.items() if city in cities)
    return next(matches, None)
geo_class('Pskov')
def my_evaluation(x_train, y_train, x_test, y_test):
# your code
return f1, precision, recall
# your code here
def count_sources_per_region(df):
    """Return the number of non-null ``traffic_source`` values per region."""
    by_region = df.groupby(['region'])
    sources = by_region['traffic_source']
    return sources.count()
# Fill missing respiratory rates with the median of rows sharing the pulse.
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
    df2.groupby('pulse')['respiratory_rate'].transform('median'))
# The original assigned to a function call, which is a SyntaxError. Round
# the column itself.
df2['respiratory_rate'] = df2['respiratory_rate'].round()
def replace_num(stnums, students, replacing_num):
    """Replace listed student numbers with ``replacing_num`` in place.

    A record matches when its first element is in ``stnums``. The original
    wrote the replacement into ``st[1]``, leaving the matched number at
    index 0 untouched; the sibling functions ``replace`` and
    ``replace_numbers`` overwrite index 0, and this version does the same.

    Returns:
        The same ``students`` list, modified in place.
    """
    for st in students:
        if st[0] in stnums:
            st[0] = replacing_num
    return students
replace_num(stnums, students, replacing_num)
def glue(x):
    """Return the string forms of the items of ``x`` joined without a separator."""
    return ''.join(str(item) for item in x)
glue(lst)
def transpose_matrix(A):
    """Return the transpose of the list-of-lists matrix ``A``.

    The column count is taken from the first row.
    """
    width = len(A[0])
    return [[row[column] for row in A] for column in range(width)]
plt.plot(x, y, 'r--')
def cond_to_float(X_train, X_test, y_train, y_test):
    """Cast all four train/test arrays to float.

    ``np.float`` was only an alias of the builtin ``float`` and has been
    removed from NumPy, so the builtin is used directly.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of converted arrays.
    """
    converted = tuple(part.astype(float)
                      for part in (X_train, X_test, y_train, y_test))
    return converted
def to_csv_string(array):
    """Render a 2-D sequence as CSV text: commas within rows, newlines between."""
    lines = []
    for row in array:
        lines.append(','.join(str(cell) for cell in row))
    return '\n'.join(lines)
class Designer(Employee):
    """Employee whose promotion check also counts international awards."""

    def __init__(self, name, seniority, awards=2):
        super().__init__(name, seniority)
        self.intlawards = awards

    def check_if_it_is_time_for_upgrade(self):
        """Add one seniority point, promote if due, and publish the grade."""
        # Every accreditation adds one point; all designers are assumed
        # to be accredited for now.
        self.seniority += 1
        # Promote when seniority plus awards is a multiple of seven.
        promotion_score = self.seniority + self.intlawards
        if promotion_score % 7 == 0:
            self.grade_up()
        return self.publish_grade()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
def anova(df):
    """Run a one-way ANOVA across the float columns of ``df``.

    Every ``float64`` column except ``Time`` is treated as one sample. The
    original passed a single 2-D array to ``f_oneway``, which expects each
    sample as a separate argument.

    Returns:
        Tuple ``(f, p)`` with the F statistic and the p-value.
    """
    from scipy import stats

    numeric_columns = df.select_dtypes(include=['float64']).columns
    numeric_columns = numeric_columns.drop('Time', errors='ignore')
    samples = [df[name] for name in numeric_columns]
    f, p = stats.f_oneway(*samples)
    return f, p
def find_values(df):
    """Return a boolean Series: does each ``values`` entry contain 'a'?"""
    return df['values'].str.contains('a')
def bar_graphs(df1, df2):
    """Plot value-count bar charts for city, hotel and date.

    ``city`` is read from ``df1``; ``hotel`` and ``date`` from ``df2``. The
    charts share one figure on a 2x2 grid.
    """
    plt.figure(figsize=(15, 7))
    panels = [(df1, 'city', 'City'), (df2, 'hotel', 'Hotel'), (df2, 'date', 'Date')]
    for position, (frame, column, title) in enumerate(panels, start=1):
        counts = frame[column].value_counts()
        plt.subplot(2, 2, position)
        # seaborn 0.12+ requires x/y as keyword arguments.
        sns.barplot(x=counts.index, y=counts.values)
        plt.xticks(rotation=40)
        plt.title(title)
    # The original ended with a bare ``plt`` expression, which draws nothing.
    plt.show()
from scipy.stats import mannwhitneyu
# gate_30 and gate_40 are separate groups of rows, so use the Mann-Whitney U
# test (which the printed label already names). The Wilcoxon signed-rank
# test is for paired samples of equal length.
stat, p = mannwhitneyu(data[data['version'] == 'gate_30']['sum_gamerounds'], data[data['version'] == 'gate_40']['sum_gamerounds'])
print('Mann-Whitney Statistics=%.3f, p=%.3f' % (stat, p))
def round_to_four(a, b):
    """Return ``a + b`` rounded to the nearest integer with ``round``."""
    total = a + b
    return round(total)
round_to_four(1.2, 3.1)
# year_leaders(df): return one row per chart_debut value.
# Step 1 sorts the rows of every chart_debut group by num_of_hits (descending)
# and flattens the result with a fresh 0..n-1 index.
# Step 2 keeps the first row of each chart_debut group, i.e. the row that
# step 1 placed on top.
def year_leaders(df):
df=df.groupby(df.chart_debut).apply(lambda x: x.sort_values('num_of_hits',ascending=False)).reset_index(drop=True)
df=df.groupby('chart_debut').head(1)
return df
year_leaders(df)
# Add a column flagging names that already appeared in an earlier row
df['duplicate_name'] = df['name'].duplicated()
# Inspect the flag column and display the flagged rows
df [df['duplicate_name'] == True]
def split_dataframe(df):
    """Return the ``Message`` column and the ``Category`` column as a pair."""
    return df['Message'], df['Category']
split_dataframe(df)
def group_by_title(df):
    """Return the mean ``rating`` for every ``title``."""
    ratings_by_title = df.groupby('title')['rating']
    return ratings_by_title.mean()
def powers_of_two(n):
    """Return the list ``[2**0, 2**1, ..., 2**n]``."""
    powers = []
    for exponent in range(n + 1):
        powers.append(2 ** exponent)
    return powers
def change_col_type(df, col_type):
    """Return ``df`` cast to ``col_type``, or None when the cast fails.

    Only conversion failures (``TypeError`` / ``ValueError``) are treated as
    "cannot cast". The original bare ``except`` also swallowed unrelated
    errors such as ``KeyboardInterrupt``.
    """
    try:
        return df.astype(col_type)
    except (TypeError, ValueError):
        return None
df2 = change_col_type(df, 'float')
def geo_class(row):
    """Return the region whose city occurs in ``row``, else ``'undefined'``.

    ``row`` may be anything that supports ``in`` with a city name, such as a
    string (substring match) or a list of names. Matching is case-sensitive.

    The original looped over ``geo_data.values()`` inside the key loop and
    tested whole city lists against ``row``, so no individual city was ever
    compared.
    """
    geo_data = {'center': ['Moscow', 'Tula', 'Yaroslavl'],
                'Northwest': ['petersburg', 'pskov', 'murmansk'],
                'Far East': ['vladivostok', 'sakhalin', 'khabarovsk']}
    for region, cities in geo_data.items():
        if any(city in row for city in cities):
            return region
    return 'undefined'
df4['loyal_profit'] = df4.apply(lambda x: (x.profit / x.ocup_rooms), axis=1)
# Build a grade / rate_group / id table, index it by (id, rate_group) and
# pivot the rate_group level into columns.
# NOTE(review): the three lists below have 6, 7 and 10 entries respectively;
# building a DataFrame from a dict of lists needs equal lengths - confirm the
# intended data before running.
df = pd.DataFrame({'grade': ['A','B','C','D','E','F'], 'rate_group': ['7-8%', '10-11%', '12-13%', '15-17%', '17-25%', '17-25%', '17-25%'], 'id': [1077501, 1077430, 1077175, 1076863, 1075358, 1075269, 1069639, 1072053, 1071795, 1071570]})
df.set_index(['id', 'rate_group'], inplace=True)
df.unstack(level='rate_group')
def get_quantity(cook_book, key, portions):
    """Print the first three salad ingredients scaled by ``portions``.

    Nothing is printed unless ``key`` is ``'salad'``.
    """
    if key != 'salad':
        return
    for position in range(3):
        ingredient = cook_book[key][position]
        grams = ingredient['quantity'] * portions
        print(f"{ingredient['ingridient_name']}: {grams} {ingredient['measure']}")
import pandas as pd
df = pd.DataFrame({
'user_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
'name': ['Ksenia Rodionova', 'Ulyana Selezneva', 'Konstantin Prokhorov', 'Petrov Vladimir', 'Arina Selivanova', 'Svetlana Kuznecova', 'Evgeniy Laptev', 'Ivan Ryzhkov', 'Sidorov Nikolay', 'Nikolay Ivanov', 'Natalya Volkova', 'Maksim Petrov', 'Maksim Petrov', 'Viktor Fomichev', 'Ulyana Selezneva'],
'date': ['2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-01', '2021-07-02', '2021-07-02', '2021-07-02', '
def round_up(x):
    """Round ``x`` to the nearest integer, with halves going up."""
    fraction = x - math.floor(x)
    return math.floor(x) if fraction < 0.5 else math.ceil(x)
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform(round_up))
def get_year(title):
    """Return the year from the last word of ``title``.

    The first and last characters of that word are dropped before parsing,
    e.g. ``"(1995)"`` becomes 1995.
    """
    last_token = title.split()[-1]
    return int(last_token[1:-1])
df['year'] = df['title'].apply(get_year)
import numpy as np
tfidf = np.array(tfidf)
from gensim import similarities
cos_sim = similarities.MatrixSimilarity(tfidf[bows])
def find_difference(a, b):
    """Return the absolute difference between the products of ``a`` and ``b``.

    ``reduce`` is not a builtin in Python 3 and was never imported here, so
    ``math.prod`` is used instead. An empty sequence has product 1.
    """
    from math import prod

    return abs(prod(a) - prod(b))
import math
def nearest_sq(n):
    """Return the perfect square nearest to ``n``."""
    root = round(math.sqrt(n))
    return root ** 2
def top20(df):
    """Return the 1st and 20th rows of ``df`` ranked by GDP per capita.

    The result keeps the ``Country or region`` and ``GDP per capita`` columns
    and is indexed by an outer level ``'Top1'`` / ``'Top20'``.

    A dict of DataFrames is not valid column data for the ``pd.DataFrame``
    constructor, so the two one-row slices are stacked with ``pd.concat``.
    The ranking is also computed once instead of twice.
    """
    ranked = (df[['Country or region', 'GDP per capita']]
              .sort_values(by='GDP per capita', ascending=False)
              .head(20))
    gdpdiff = pd.concat({'Top1': ranked.iloc[0:1], 'Top20': ranked.iloc[19:20]})
    return gdpdiff
top20(df19)
def data_clean(df):
    """Reduce ``chart_debut`` to the text before its first '-', in place.

    Returns the same DataFrame.
    """
    leading_part = df['chart_debut'].str.split('-').str[0]
    df['chart_debut'] = leading_part
    return df
data_clean(df)
mean_c.sort_values(ascending=False)
import pandas as pd
ratings = pd.read_csv('ratings.csv')
def aver_lifetime(data):
    """Return the mean span between each user's first and last rating day.

    ``timestamp`` is read as seconds since the epoch and truncated to the
    day. The dates stay in datetime64 form (``dt.normalize``) so the
    subtraction gives a timedelta Series with a well-defined mean; the
    original's ``dt.date`` produced Python objects. The input frame is no
    longer given an extra ``date`` column.

    Returns:
        ``pandas.Timedelta`` averaged over users.
    """
    rating_days = pd.to_datetime(data['timestamp'], unit='s').dt.normalize()
    per_user = rating_days.groupby(data['userId']).agg(['min', 'max'])
    lifetimes = per_user['max'] - per_user['min']
    return lifetimes.mean()
aver_lifetime(ratings)
def assignDecade(x):
    """Return the decade label (e.g. ``"1990-2000"``) for a year string.

    Non-string input yields NaN. The original had the type check inverted:
    it returned NaN for strings and tried to slice everything else.
    """
    if not isinstance(x, str):
        return np.nan
    # The first three digits identify the decade: "1995" -> 199.
    decade_prefix = int(x[:3])
    return str(decade_prefix) + "0-" + str(decade_prefix + 1) + "0"
df['Decade of Release'] = df['Year'].map(assignDecade)
df
def round_nearest_int(x):
    """Return ``x`` rounded with the builtin ``round``."""
    nearest = round(x)
    return nearest
def sample_size(error, sigsqr, conf):
    """Return the sample size for estimating a mean within ``error``.

    Uses n = z**2 * sigsqr / error**2, with ``z`` the two-sided
    standard-normal critical value for confidence ``conf``. The original
    had the formula upside down (error**2 * sigsqr / z**2) and ignored
    ``conf`` by hard-coding z = 1.96.

    Args:
        error: Allowed margin of error.
        sigsqr: Variance.
        conf: Confidence level, strictly between 0 and 1.

    Returns:
        The unrounded sample size as a float.
    """
    from statistics import NormalDist

    z_score = NormalDist().inv_cdf((1 + conf) / 2)
    return (z_score ** 2) * sigsqr / (error ** 2)
def highlight_min_max(df1):
return df1.style.highlight_max(axis=1,
props='color:white;\
font-weight:bold;\
background-color:green;').apply(highlight_min, axis=1,
props='color:white;\
font-weight:bold;\
background-color:brown;')
highlight_min_max(df2)
def replace_nums(students, stnums, replacing_num):
    """Swap listed numbers for ``replacing_num`` in each student record.

    A matching number is removed and ``replacing_num`` is appended, so the
    replacement ends up last in the record. Records are changed in place.
    """
    for record in students:
        for number in stnums:
            if number not in record:
                continue
            record.remove(number)
            record.append(replacing_num)
    return students
print(replace_nums(students, stnums, replacing_num))
def f(dframe):
    """Plot every column of ``dframe`` as a line on one shared axes.

    X ticks are the unique values of column level 0.
    """
    years = dframe.columns.get_level_values(0).get_level_values(0).unique()
    axes = plt.figure().add_subplot(111)
    for column in dframe.columns:
        dframe[column].plot(kind='line', ax=axes)
    plt.xticks(years)
    plt.show()
f(df)
import pandas as pd
# Create DataFrame
df = pd.DataFrame({'spi_rank' : [21,19,20], 'country': ['Australia', 'Canada', 'Chile']})
def function(df):
    """Remove ``spi_rank`` and ``country`` from ``df`` in place and return it."""
    df.drop(columns=['spi_rank', 'country'], inplace=True)
    return df
function(df)
regions = ['North America', 'Europe', 'Japan', 'Other']
sales = [4402.62, 2424.67, 1297.43, 791.34]
plt.pie(sales, labels=regions, autopct='%1.1f%%')
plt.show()
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def load_iris():
    """Return the iris dataset object from ``sklearn.datasets``."""
    return datasets.load_iris()
iris = load_iris()
#create a list
arr = [2, 1, 10, 5]
def sum_from_list(arr):
    """Return the sums of all pairs ``arr[i] + arr[j]`` with i < j, in order."""
    return [
        first + second
        for position, first in enumerate(arr)
        for second in arr[position + 1:]
    ]
sum_from_list(arr)
def interval(n, mean, sig, conf):
    """Return the width of the confidence interval, rounded to an integer.

    The half-width is ``sig * z / sqrt(n)`` with ``z`` the normal quantile at
    ``(1 + conf) / 2``. ``mean`` is not used in the calculation.
    """
    z_score = stats.norm.ppf((1 + conf) / 2)
    half_width = sig * z_score / (n ** 0.5)
    return int(round(half_width * 2))
import pandas as pd
import numpy as np
df = pd.DataFrame({'rate_group': ['new', 'new%', 'old', 'old%', 'new%']})
print(df)
def change_dtype(df):
    """Cast every column that can be converted to float, in place.

    Columns whose values cannot be converted are left as they are. Only
    conversion errors are ignored; the original bare ``except`` also hid
    unrelated failures.

    Returns:
        The same DataFrame.
    """
    for col in df.columns:
        try:
            df[col] = df[col].astype(float)
        except (TypeError, ValueError):
            # Not numeric: keep the column unchanged.
            continue
    return df
df = change_dtype(df)
import matplotlib.pyplot as plt
df.plot(x='year')
plt.show()
df = pd.DataFrame([['a', 1, 2], ['a', 2, 3], ['b', 2, 3]], columns = ['performer', 'min', 'max'])
# sort_by_max(df, groupby_column, column_to_sort): keep the two named columns,
# group by groupby_column, aggregate each group with both 'min' and 'max', and
# sort the aggregated frame by column_to_sort.
# NOTE(review): the aggregations are passed as a set literal, so the code
# does not fix the order of the resulting columns - confirm that a list
# was not intended.
def sort_by_max(df, groupby_column, column_to_sort):
return df[[groupby_column, column_to_sort]].groupby(groupby_column).agg({'min', 'max'}).sort_values(by=column_to_sort)
sort_by_max(df, 'performer', 'max')
def get_shop_list_by_dishes(dishes, person_count):
cook_book = {
'salad': [
{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'gr'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pct'},
{'ingridient_name': 'pepper', 'quantity': 20, 'measure': 'gr'}],
'cucumbers': [
{'ingridient_name': 'cucumbers', 'quantity': 20, 'measure': 'gr'},
{'ingridient_name': 'pepper', 'quantity': 8, 'measure': 'gr'},
{'ingridient_name': 'olives', 'quantity': 8, 'measure': 'gr'},
{'ingridient_name': 'olive oil', 'quantity': 30, 'measure': 'ml'}],
'olives': [
def av_revenue(df):
    """Add ``av_revenue``, the mean revenue of each row's hotel, in place."""
    revenue_by_hotel = df.groupby('hotel')['revenue']
    df['av_revenue'] = revenue_by_hotel.transform('mean')
    return df
def difference(df):
    """Add ``difference`` = ``av_revenue`` - ``revenue`` to ``df`` in place."""
    gap = df['av_revenue'] - df['revenue']
    df['difference'] = gap
    return df
def in_percent(df):
    """Add ``in_percent``: ``difference`` as a percentage of ``av_revenue``."""
    share = 100 * df['difference'] / df['av_revenue']
    df['in_percent'] = share
    return df
import pandas as pd
def decade(year):
    """Return the decade label for a numeric ``year``.

    Years before 1910 or after 2009 are labelled ``start-(start + 10)``;
    years in between are labelled ``start-(start + 9)``.
    """
    start = year - year % 10
    span = 10 if (year < 1910 or year > 2009) else 9
    return str(start) + '-' + str(start + span)
df['Decade of Release'] = df['Year'].map(decade)
import pandas as pd
list = [['/world/'],
['/latest/'],
['/?updated=top'],
['/politics/36188461-s-marta-zhizn-rossiyan-suschestvenno-izmenitsya-iz-za-novyh-zakonov/']
['/world/36007585-tramp-pridumal-kak-reshit-ukrainskiy-vopros/'],
['/science/36157853-nasa-sobiraet-ekstrennuyu-press-konferentsiyu-na-temu-vnezemnoy-zhizni/'],
['/video/36001498-poyavilis-pervye-podrobnosti-gibeli-natali-melamed/'],
['/world/36007585-tramp-pridumal-kak-reshit-ukrainskiy-vopros/?smi2=1']
['/science/
def year_leaders(dfp):
    """Return the top ``num_of_hits`` row for every ``chart_debut`` value.

    The result is indexed by ``chart_debut`` (descending); ``dfp`` is not
    changed.
    """
    sort_keys = ['chart_debut', 'num_of_hits']
    ranked = dfp.sort_values(by=sort_keys, ascending=False)
    best_per_year = ranked.drop_duplicates(subset='chart_debut', keep='first')
    return best_per_year.set_index('chart_debut')
year_leaders(dfp)
# Label and display the percentage-difference chart ("vegitable" typo fixed).
plt.title('Percentage difference between nutrient and mineral content for each vegetable')
plt.xlabel('Vegetable')
plt.ylabel('Percentage difference')
plt.legend()
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
def rate_group(rate):
    """Return the rate bucket label.

    ``'>15'`` for rates above 15, ``'10-15'`` for rates above 10 and up to
    15, and None otherwise.
    """
    if rate > 15.00:
        return '>15'
    if 10.00 < rate <= 15.00:
        return '10-15'
    return None
def search(query):
    """Collect posts matching ``query`` from the first ten Habr listing pages.

    ``query`` is lower-cased and each page's articles are handed to
    ``get_needed_posts``. The per-page results are concatenated and returned
    with a fresh index.
    """
    query = query.lower()
    site = pd.DataFrame()
    for page in range(0, 10):
        # Page 0 is the bare listing URL; later pages use a /pageN/ suffix.
        url = ('https://habr.com/ru/all/' if page == 0
               else 'https://habr.com/ru/all/page' + str(page) + '/')
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('li', class_='content-list__item_post')
        site = pd.concat([site, get_needed_posts(query, articles)])
    return site.reset_index(drop=True)
def get_needed_posts(query, articles):
site = pd.DataFrame()
for article in articles:
title, date, link = get_article_info(article)
if not title in list(site['title']) and not link in list(site['link']):
row = {'
import pandas as pd
ratings = pd.read_csv('ratings.csv')
filtered_ratings = ratings[['title', 'rating']]
filtered_ratings = filtered_ratings.groupby('title').mean()
filtered_ratings
def mean(x):
    """Return the arithmetic mean of ``x`` (``sum(x) / len(x)``)."""
    total = sum(x)
    return total / len(x)
import pandas as pd
df = pd.read_csv('ratings.csv')
df.groupby('userId').size().to_frame('size').reset_index()
import numpy as np
def sum_matrix(N):
    """Return the trace of the N x N matrix with diagonal N-1, ..., 0."""
    descending = np.arange(N - 1, -1, -1)
    diagonal_matrix = np.diag(descending, k=0)
    return np.trace(diagonal_matrix)
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
df = df[df.duplicated(subset=["name"], keep=False)].sort_values("name")
import numpy as np
def stdev(A):
    """Return ``np.std(A)``, the standard deviation with NumPy's defaults."""
    deviation = np.std(A)
    return deviation
B = [1, 2, 3, 4, 5]
print(stdev(B))
def create_plot(question, title, figure_template):
    """Show a chart of answer counts for one survey question.

    Args:
        question: Column of the module-level ``df`` to count.
        title: Chart title.
        figure_template: Callable taking ``x`` and ``y`` keywords and
            returning a figure, e.g. ``px.bar``.
    """
    answer_counts = df[question].value_counts()
    fig = figure_template(x=answer_counts.index, y=answer_counts.values)
    fig.update_layout(title_text=title)
    fig.show()
create_plot('What is the most preferred working environment for you.', 'Какая рабочая среда для вас наиболее предпочтительна?', px.bar)
def top(df, column_name):
    """Return the 20 rows with the largest ``column_name`` values."""
    ranked = df.sort_values(by=column_name, ascending=False)
    return ranked[:20]
top(df, 'av_temp')
def is_month_end(date):
# Your code goes here.
is_month_end(date)
import pandas as pd
df = pd.read_csv('news.csv', delimiter='\t')
def filter_news(news_title):
    """Check the path pattern of ``news_title``.

    True when it starts with '/', contains exactly two '/', ends with a
    digit and does not end with '/'.
    """
    return bool(
        news_title.startswith('/')
        and news_title.count('/') == 2
        and news_title[-1].isdigit()
        and not news_title.endswith('/')
    )
df.news_title.apply(filter_news)
# function to create a new data frame, water, mineral, nutri and then concatenate them to form a new df
def df_maker(df, name):
    """Return an ``index`` / ``treatments`` / ``value`` frame for one treatment.

    A ``treatments`` column holding ``name`` is added to ``df`` itself, then
    the index is moved into an ``index`` column of the returned frame.
    """
    df['treatments'] = name
    flattened = df.reset_index()
    return flattened[['index', 'treatments', 'value']]
# create water df
water = df_maker(water, 'water')
# create nutri df
nutri = df_maker(nutri, 'nutri')
# create mineral df
mineral = df_maker(mineral, 'mineral')
# combine df
combined_df = pd.concat([water, nutri, mineral])
combined_df
def panda_function(df):
    """Add ``loyal_profit`` (profit per occupied room) for loyal customers.

    Rows whose ``how_find_us`` is ``'regular_customer'`` or
    ``'by_recommendation'`` get ``profit / ocup_rooms``; all others get no
    value. The original tested ``[list] in x.how_find_us``, which raises
    TypeError for string values, and returned nothing.

    Returns:
        The same DataFrame, modified in place.
    """
    loyal_sources = ('regular_customer', 'by_recommendation')
    df['loyal_profit'] = df.apply(
        lambda row: (row.profit / row.ocup_rooms)
        if row.how_find_us in loyal_sources else None,
        axis=1)
    return df
question6 = 'What is the most preferred working environment for you.'
question6 = df[question6].value_counts()
label = question6.index
counts = question6.values
fig = px.bar(x=label, y=counts)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
def find_us(df):
    """Normalise ``how_find_us`` for the 'Alpina' hotel rows, in place.

    Values containing 'agg' become 'aggregators'; after that, values
    mentioning facebook, vk, instagram or telegram become 'social'.
    """
    def to_aggregators(source):
        return 'aggregators' if 'agg' in source else source

    def to_social(source):
        if 'facebook' in source or 'vk' in source or 'instagram' in source or 'telegram' in source:
            return 'social'
        return source

    alpina = df.hotel == 'Alpina'
    df.loc[alpina, 'how_find_us'] = df.loc[alpina, 'how_find_us'].map(to_aggregators)
    df.loc[alpina, 'how_find_us'] = df.loc[alpina, 'how_find_us'].map(to_social)
    return df
def func(x, pos):
    """Tick formatter: integer part of ``x`` with thousands separators.

    ``pos`` (the tick position) is accepted but not used.
    """
    return '{:0,d}'.format(int(x))
import matplotlib.ticker as ticker
formatter = ticker.FuncFormatter(func) # make formatter
plt.gca().yaxis.set_major_formatter(formatter) # set formatter to needed axis
df_new.groupby(['route', 'incident_type']).count()['id']
df_new.groupby(['route', 'incident_type'])['id'].count()
df_new.groupby(['route', 'incident_type'])['id'].size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type']).count()['id']
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type']).size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type'])['id'].size()
df_new[df_new['incident_type'] >= 5].groupby(['route', 'incident_type'])['id'].size().sort_values(ascending=False)
def decade_of_release(year):
    """Return the decade label for an ``int`` year.

    Years before 1900 map to ``"1800-1900"`` and years from 1900 to 2009 to
    their decade, e.g. ``"1990-2000"``. Anything else (non-int input, 2010
    and later) gives None.
    """
    if type(year) != int:
        return None
    if year < 1900:
        return "1800-1900"
    if year >= 2010:
        return None
    start = year // 10 * 10
    return str(start) + "-" + str(start + 10)
import matplotlib.pyplot as plt
df.plot()
plt.show()
df=pd.DataFrame({'userId':[1,1,1,1,1],
'movieId':[31,1029,1061,1129,1172],
'rating':[2.5,3,3,2,4],
'timestamp':[1260759144,1260759179,1260759182,1260759185,1260759205]
})
df
def matrix_multiplication(matrix, vector):
    """Return the product of ``matrix`` and ``vector`` as a list.

    Each entry is the dot product of one row with ``vector``; the number of
    columns is taken from the first row.
    """
    products = []
    for row in matrix:
        accumulator = 0
        for column in range(len(matrix[0])):
            accumulator += row[column] * vector[column]
        products.append(accumulator)
    return products
vector = [1,2,3,4]
A1 = array([[0, 0, 0, 0, 1, 0],
[0, 1, 0, 1, 0, 0],
[0, 0, 1, 1, 1, 0],
[1, 0, 0, 1, 0, 1]])
matrix_multiplication(A1,vector)
plt.plot(data.index, data['nutri'], label = 'nutri')
plt.plot(data.index, data['mineral'], label = 'mineral')
# write your code here
import matplotlib.pyplot as plt
from math import cos, sin, radians, degrees
def plot_vector(v, ax=None):
ax = ax or plt.gca()
arrowprops=dict(arrowstyle='->',
linewidth=2,
shrinkA=0, shrinkB=0)
ax.annotate('', v, v+v, arrowprops=arrowprops)
film_2 = np.array([0,1,0,1,0,0])
film_4 = np.array([1,0,0,1,0,1])
ax = plt.axes()
plot_vector(film_2, ax)
plot_vector(film_4, ax)
ax.set_xlim(0, 2)
ax.set_ylim(0, 2)
plt.show()
def t_test(A, B):
    """Return the result of the comparison ``B > A``."""
    is_greater = B > A
    return is_greater
class Managers(Employee):
    """Employee whose promotion check weights international awards double."""

    def __init__(self, name, seniority, awards):
        super().__init__(name, seniority)
        self.intlawards = awards

    def check_if_it_is_time_for_upgrade(self):
        """Add one seniority point, promote if due, and publish the grade."""
        # Every accreditation adds one point; everyone is assumed to pass
        # the accreditation for now.
        self.seniority += 1
        # Promote when seniority plus twice the awards is a multiple of 7.
        # The original lacked the outer parentheses, so ``%`` applied only
        # to ``intlawards * 2`` and the test was
        # ``seniority + (awards * 2 % 7) == 0``.
        if (self.seniority + self.intlawards * 2) % 7 == 0:
            self.grade_up()
        # Publish the result.
        return self.publish_grade()
def add(a, b):
    """Return ``a + b``."""
    total = a + b
    return total
add(1, 2)
def merge_arrays(arr1, arr2):
    """Return the sorted list of distinct values from both inputs.

    The original sorted first and then went through ``set``, which does not
    keep order, and it also extended the caller's ``arr1``. Here the values
    are deduplicated first and sorted last, and neither input is modified.
    """
    return sorted(set(arr1).union(arr2))
def to_binary(n):
    """Return the binary digits of the integer ``n`` without the ``0b`` prefix."""
    return format(n, 'b')
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform(round))
import pandas as pd
df = pd.read_csv("https://www.dropbox.com/s/jr9c7rwhi8hvuk7/performers.csv?dl=1")
df.sort_values(by=['time_on_chart','max'], ascending=False).head(20)
performer min max time_on_chart
9 "Weird Al" Yankovic 1 20 21
0 "Groove" Holmes 1 11 12
1 "Little" Jimmy Dickens 1 10 11
2 "Pookie" Hudson 1 1 2
import seaborn as sns
import matplotlib.pyplot as plt
def bar_graph(data):
    """Show a bar chart of a Series: index on x, values on y.

    The title is the Series name.
    """
    plt.figure(figsize=(18, 6))
    # seaborn 0.12+ requires x/y as keyword arguments.
    sns.barplot(x=data.index, y=data.values, alpha=0.8)
    plt.title(str(data.name))
    plt.ylabel('Count', fontsize=12)
    plt.xlabel('Name', fontsize=12)
    plt.show()
class Designer(Employee):
    """Employee whose promotion check also counts international awards."""

    def __init__(self, name, seniority, awards=2):
        super().__init__(name, seniority)
        self.intlawards = awards

    def check_if_it_is_time_for_upgrade(self):
        """Add one seniority point, promote if due, and publish the grade."""
        self.seniority += 1
        # The original added the awards to ``self.seniority`` on every call,
        # inflating seniority each time. The awards now count only in the
        # promotion test, as in the other Designer versions in this file.
        if (self.seniority + self.intlawards) % 7 == 0:
            self.grade_up()
        return self.publish_grade()
def fill_na(df, column_name):
    """Fill missing values of ``column_name`` in place and return ``df``.

    The fill value is the mean of the column's correlations (its column in
    ``df.corr()``) multiplied by the column's own mean.
    """
    correlations = df.corr()[column_name]
    replacement = correlations.mean() * df[column_name].mean()
    df[column_name] = df[column_name].fillna(replacement)
    return df
fill_na(df2, column_name='rectal_temp')
def df_gdp_diff(df):
    """Return the 1st and 20th rows of the 2019 GDP-per-capita ranking.

    Rows of ``df`` with ``Year == 2019`` are ranked by ``GDP per capita``.
    The result keeps ``Country or region`` and ``GDP per capita`` and has an
    outer index level ``'Top1'`` / ``'Top20'``.

    The original ignored ``df`` in favour of a global ``df_19``, and passed
    a dict of DataFrames to ``pd.DataFrame``, which is not valid column
    data. The slices are stacked with ``pd.concat`` instead.
    """
    df19 = df[df['Year'] == 2019]
    ranked = (df19[['Country or region', 'GDP per capita']]
              .sort_values(by='GDP per capita', ascending=False)
              .head(20))
    gdpdiff = pd.concat({'Top1': ranked.iloc[0:1], 'Top20': ranked.iloc[19:20]})
    return gdpdiff
# Task: the code below raises "ValueError: Input variables with an inconsistent number of samples were found: [8082, 5572]". Fix it.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
df["operator"].apply(len).mean()
def df_incident_type(df):
    """Return the most frequent incident combination for every route.

    Rows are counted per (route, operator, group_name, incident_type), and
    for each route only the combination with the highest count is kept.

    The original counted the ``incident_type`` column while also grouping
    by it, so the result had ``incident_type`` both as an index level and as
    a column; sorting by that name and ``reset_index`` then fail. The
    counts now live in a separate ``count`` column.

    Returns:
        DataFrame with the four key columns plus ``count``.
    """
    keys = ['route', 'operator', 'group_name', 'incident_type']
    counts = df.groupby(keys).size().reset_index(name='count')
    counts = counts.sort_values('count', ascending=False)
    return counts.drop_duplicates(subset=['route'], keep='first')
class Designer(Employee):
    """Employee subclass that tracks international awards for promotions."""

    def __init__(self, name, seniority, awards=2):
        super().__init__(name, seniority)
        self.intlawards = awards

    def check_if_it_is_time_for_upgrade(self):
        """Increase seniority by one, promote when due, publish the grade."""
        # One point per accreditation; all designers are assumed accredited.
        self.seniority += 1
        # Promotion is due when seniority plus awards divides evenly by 7.
        due_for_promotion = (self.seniority + self.intlawards) % 7 == 0
        if due_for_promotion:
            self.grade_up()
        # Publish the result.
        return self.publish_grade()
for i in winnums:
money += 1
print(money)
sns.barplot(x = 'hotel', y ='difference', hue = 'date', data = hotels_rev)
plt.show()
def expression_matter(a, b, c):
    """Return the largest of a*b*c, a*(b+c), (a+b)*c and a+b+c."""
    candidates = (
        a * b * c,
        a * (b + c),
        (a + b) * c,
        a + b + c,
    )
    return max(candidates)
class Ball:
    """A ball with a ``ball_type`` attribute, ``"regular"`` by default."""

    def __init__(self, ball_type="regular"):
        self.ball_type = ball_type
def replace_in_column(df, column, old_value, new_value):
    """Replace ``old_value`` with ``new_value`` in ``df[column]``, in place.

    The result is assigned back to the column. The original called
    ``replace(..., inplace=True)`` on ``df[column]``, which may act on a
    temporary copy and leave ``df`` unchanged (notably under pandas
    copy-on-write).
    """
    df[column] = df[column].replace(old_value, new_value)
import pandas as pd
df = pd.DataFrame([
[1, "Ksenia Rodionova", "2021-07-01", "Alpina", 1639.000000, "by_recommendation", 48, 3.0],
[2, "Ulyana Selezneva", "2021-07-01", "AquaMania", 930.000000, "by_airbnb.com", 97, 4.0],
[3, "Konstantin Prokhorov", "2021-07-01", "Breeze", 1057.720000, "agg_trivago.com", 173, 4.0],
[4, "Petrov Vladimir", "2021-07-01", "Moreon", 1403.000000, "agg_onlinetours.ru", 229, 4.0],
[5, "Arina Selivanova", "2021-07-01", "Alpina", 1639.000000, "agg_sutochno.ru", 63, 4.0],
[6
if(data['nutri']>data['mineral']):
plt.text(data['mineral'] - (data['nutri'] - data['mineral']), data.index, '{:.2f}%'.format(data['mineral'] - data['nutri']), color='white',
ha="center", va='bottom')
else:
plt.text(data['mineral'] + (data['mineral'] - data['nutri']), data.index, '{:.2f}%'.format(data['mineral'] - data['nutri']), color='black',
ha="center", va='bottom')
plt.legend()
plt.title('Nutri vs Mineral', size = 20)
plt.xlabel('Country', size = 20)
plt.ylabel('Percentage', size = 20)
plt.show()
# Profit per occupied room, only for loyal customers. The original condition
# ``'regular_customer' or 'by_recommendation' in ...`` is always truthy
# because a non-empty string literal is true, so every row was computed.
df4['loyal_profit'] = df4.apply(lambda x: (x.profit / x.ocup_rooms) if x.how_find_us in ('regular_customer', 'by_recommendation') else None, axis=1)
def high_rating(df):
    """Print the ``decade`` column once for every rating above 6.5.

    NOTE(review): the whole ``decade`` column is printed each time, not the
    decade of the matching row. Confirm that this is intended.
    """
    for rating in df['rating']:
        if not rating > 6.5:
            continue
        print("The years with most high rating movies are",df['decade']) #used for
def groupby_cnt(df):
    """Return a one-column frame ``ratings_cnt``: ratings counted per user."""
    counts = df.groupby("userId")["rating"].count()
    return counts.to_frame(name='ratings_cnt')
def square_or_square_root(arr):
    """Map each number to its integer square root, or to its square.

    Numbers whose square root is a whole number become that root; all
    others are squared.
    """
    def convert(value):
        root = value ** (1 / 2)
        return int(root) if int(root) == root else value ** 2

    return [convert(value) for value in arr]
def replace(stnums, students, replacing_num):
    """Replace the first field of matching student records in place.

    A record matches when its first element is contained in ``stnums``.
    The same ``students`` list is returned.
    """
    for record in students:
        if record[0] in stnums:
            record[0] = replacing_num
    return students
replace(stnums, students, replacing_num)
def delete(df):
    """Return ``df`` without rows whose ``Class 1`` or ``Class 2`` contains 'Rock'."""
    in_first = df['Class 1'].str.contains('Rock')
    in_second = df['Class 2'].str.contains('Rock')
    rock_rows = df[in_first | in_second]
    return df.drop(rock_rows.index)
delete(grass)
def plot_barchart(df):
    """Draw a horizontal bar chart of the ten highest ``perc_of_5star`` rows.

    Returns what ``DataFrame.plot.barh`` returns.
    """
    top_ten = df.sort_values('perc_of_5star', ascending = False).head(10)
    chart = top_ten.plot.barh(x = 'decade', y = 'perc_of_5star', title = '% 5-star ratings by decade')
    return chart
plot_barchart(df)
import pandas as pd
performer = ['Glee Cast', 'Taylor Swift', 'Drake', 'YoungBoy Never Broke Again', 'Aretha Franklin', 'The Beatles']
hits = ['Somebody To Love', 'Friday', 'Loser Like Me', 'Baby', 'I Want You Back', 'Kacey Talk', 'Put It On Me', 'Dirty Iyanna', 'Lil Top', 'London Boy', 'Teardrops On My Guitar', 'Fifteen', 'Summer Sixteen', 'The Language', 'Weston Road Flow', 'Sgt. Pepper\'s Lonely Hearts Club Band/With A Little Help From My Friends']
chart_debut = [2009, 2008, 2016, 2020, 1967, 1978]
time_on_chart = [290, 14299, 7449, 1012, 3490, 3548]
consecutive_weeks = [47.0, 11880.0, 6441.0, 625.0, 2921.0, 2798.0]
decade = ['2000-2010', '2000-2010', '2010-2020', '2020-2030
import pandas as pd
def group(df):
    """Pivot ``df`` to ``rate_group`` rows by ``grade`` columns, summing ``id``.

    The aggregation is named by string (``'sum'``), so the function no
    longer needs ``np``, which this snippet never imports. pandas also
    recommends the string name over ``np.sum`` for ``aggfunc``.
    """
    return df.pivot_table(index='rate_group', columns='grade', values='id', aggfunc='sum')
def delete_grass(pokemon):
    """Drop rows whose ``Class 1`` or ``Class 2`` is 'Grass', in place.

    All matching rows are found with one vectorised comparison and dropped
    together, instead of dropping row by row while looping over the index.

    Returns:
        The same DataFrame.
    """
    is_grass = pokemon[['Class 1', 'Class 2']].eq('Grass').any(axis=1)
    pokemon.drop(index=pokemon.index[is_grass], inplace=True)
    return pokemon
from scipy.stats import mannwhitneyu
# ...
stat, p = mannwhitneyu(data[data['version'] == 'gate_30']['sum_gamerounds'], data[data['version'] == 'gate_40']['sum_gamerounds'])
print('Mann-Whitney Statistics=%.3f, p=%.3f' % (stat, p))
df.plot(kind='box', subplots=True, layout=(4,2), sharex=False, sharey=False, fontsize=14, linewidth=3)
plt.show()
df['occupancy_rate'] = df.ocup_rooms / df.total_rooms
def unique(data: pd.DataFrame) -> pd.DataFrame:
    """Placeholder: not implemented yet, returns None.

    The annotations now name the ``pd.DataFrame`` type. The original wrote
    ``pd.DataFrame()``, which builds an empty DataFrame instance when the
    function is defined.
    """
    pass
# Task: modify the code below to draw a vertical bar chart instead of a pie chart (using the plotly.express library).
question6 = "How likely would you work for a company whose mission is not bringing social impact ?"
question6 = data[question6].value_counts()
label = question6.index
counts = question6.values
colors = ['gold','lightgreen']
fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='How likely would you work for a company whose mission is not bringing social impact?')
fig.update_traces(hoverinfo='label+value', textinfo='percent', textfont_size=30,
marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()
def chart_peak(df):
    """Keep one row per song: the one with the smallest ``peak_position``.

    The frame is sorted and de-duplicated in place and then returned.
    """
    sort_keys = ['song', 'peak_position']
    df.sort_values(by=sort_keys, inplace=True)
    # After the ascending sort the first row of each song is its best peak.
    df.drop_duplicates(subset='song', keep='first', inplace=True)
    return df
def max_key(dct):
    """Return the key whose nested ``'Value'`` entry is the largest."""
    best_key, _ = max(dct.items(), key=lambda pair: pair[1]['Value'])
    return best_key
max_key(dct)
def distinct(seq):
    """Return the items of ``seq`` without repeats, in first-seen order."""
    seen = set()
    ordered = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered
distinct([1, 2, 2, 3, 4, 4, 5])
def size(delta, sigsqr, conf):
    """Return the sample size ``sigsqr * z**2 / delta**2``.

    Args:
        delta: Allowed error (margin).
        sigsqr: Variance.
        conf: Probability in (0, 1) used to pick the standard-normal
            quantile ``z``; 0.95 gives z ~ 1.645, the value that was
            previously hard-coded regardless of ``conf``.

    Returns:
        The (un-rounded) sample size as a float.
    """
    from statistics import NormalDist

    z = NormalDist().inv_cdf(conf)
    return (sigsqr * z ** 2) / (delta ** 2)
def switch_elements(arr):
    """Return a new list with the first and last elements of ``arr`` swapped.

    Lists with fewer than two elements are returned as a copy; previously a
    one-element list came back doubled and an empty list raised IndexError.
    """
    if len(arr) < 2:
        return list(arr)
    return [arr[-1]] + arr[1:-1] + [arr[0]]
def quadratic(x1, x2):
    """Return coefficients ``(a, b, c)`` of the monic quadratic with roots x1, x2."""
    linear = -x1 - x2
    constant = x1 * x2
    return (1, linear, constant)
cos_sim = similarities.MatrixSimilarity(tfidf[bows].values)
def group_list(dct, gr):
    """Print and return the sorted names of the records that contain ``gr``.

    Args:
        dct: Mapping whose values are sequences of strings; the first three
            items of a record are joined with spaces to form the name.
        gr: Value that must be present in a record for it to be listed.

    Returns:
        The sorted list of names (the function used to return ``None``,
        which made ``len(group_list(...))`` fail).
    """
    names = sorted(
        ' '.join(record[0:3]) for record in dct.values() if gr in record
    )
    for position, name in enumerate(names, start=1):
        print('{}. {}'.format(position, name))
    return names
group_list(dct, 'BST161')
# output
# 1. A. García de Leon
# 2. A. Martínez Martínez
# 3. A. Romero de la Fuente
# 4. C. Ramírez de Cartagena
# 5. E. González Gómez
# 6. F. García León
# 7. H. Solís Ortíz
# 8. J. Carlos
# 9. J. Fernández
# 10. J. Muñoz Solís
# 11. L. González Gómez
def highlight_min_max(df):
    """Build row-wise max/min highlight stylers for ``df`` and return ``df``.

    NOTE(review): both Styler objects created below are discarded, so the
    returned DataFrame carries no styling. Returning the chained Styler is
    presumably what was intended; confirm with callers before changing the
    return type.
    """
    # Highlight the largest value of each row (axis=1).
    df.style.highlight_max(axis=1,
                           props='color:white;\
font-weight:bold;\
background-color:green;')
    # Highlight the smallest value of each row (axis=1).
    df.style.highlight_min(axis=1,
                           props='color:white;\
font-weight:bold;\
background-color:brown;')
    return df
df3.loc[df3.how_find_us.str.contains('yandex') == True, ['how_find_us']]
def occupancy_rate(total_rooms, ocup_rooms):
    """Return the share of occupied rooms: ``ocup_rooms / total_rooms``."""
    rate = ocup_rooms / total_rooms
    return rate
df['occupancy_rate'] = occupancy_rate(df['total_rooms'], df['ocup_rooms'])
def create_bar_chart(x, y, title):
    """Show a horizontal plotly bar chart of ``y`` against ``x`` titled ``title``."""
    chart = px.bar(x=x, y=y, orientation='h')
    # update_layout returns the figure, so the calls can be chained.
    chart.update_layout(title_text=title).show()
def five_star_decade_value_counts(df):
    """Count, per decade, the rows of ``df`` whose ``rating`` equals 5.0."""
    is_five_star = df.rating == 5.0
    five_star_rows = df.loc[is_five_star]
    return five_star_rows.decade.value_counts()
def add(a, b):
    """Return ``a + b``."""
    total = a + b
    return total
add(1, 2)
def interval(n, mean, sig, conf):
    """Return the width ``2 * z * sig / sqrt(n)`` with ``z = norm.ppf(conf)``.

    Args:
        n: Sample size.
        mean: Sample mean (accepted but not used in the width).
        sig: Standard deviation.
        conf: Probability passed to ``norm.ppf``.

    Returns:
        The interval width as a float.
    """
    # The original multiplied and divided by sqrt(n), cancelling it out so
    # the width never depended on the sample size.
    h = 2 * sig * norm.ppf(conf) / math.sqrt(n)
    return h
def sort_df(df):
    """Return ``df`` sorted by value in descending order."""
    descending = df.sort_values(ascending=False)
    return descending
sort_df(mean_c)
def round_floats(df, col):
    """Return column ``col`` of ``df`` with ``np.round`` applied to it."""
    column = df[col]
    return column.apply(np.round)
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('data.csv')
# Add your code below:
plt.figure(figsize=(12, 4))
plt.bar(data.index, data['nutri'], color='darkblue', label='nutri')
plt.bar(data.index, data['mineral'], color='brown', label='mineral')
plt.xticks(data.index, data['food'])
plt.xlabel('Food')
plt.ylabel('Percentage difference')
plt.title('Percentage difference of nutrients and minerals')
plt.legend()
plt.show()
def bar_x_axis(df, column_name, title):
    """Show a horizontal bar chart of the value counts of ``df[column_name]``.

    Args:
        df: Source DataFrame.
        column_name: Column whose distinct values are counted.
        title: Chart title.
    """
    # Prepare the data. The original read ``question6`` (a global from another
    # snippet) here instead of the counts computed for ``column_name``.
    question = df[column_name].value_counts()
    label = question.index
    counts = question.values
    # Create the figure.
    fig = px.bar(x=counts, y=label, orientation='h')
    fig.update_layout(title_text=title)
    fig.show()
bar_x_axis(df,'What is the most preferred working environment for you.','Какая рабочая среда для вас наиболее предпочтительна?')
def type_checker(variable, type):
    """Return True when the exact type of ``variable`` is ``type``.

    The parameter name shadows the builtin ``type``, so the builtin is
    reached through ``builtins``; the original ended up calling the passed
    type as a constructor instead.
    """
    import builtins

    return builtins.type(variable) == type
ratings = pd.read_csv("ratings.csv")
ratings.head()
import scipy
import pandas as pd
import numpy as np
data = pd.read_csv("cookie_cats.csv")
data.head()
def mann_whitney_test(data):
    """
    Run a Mann-Whitney U test on ``sum_gamerounds`` between the two versions.

    Compares rows with ``version == 'gate_30'`` against rows with
    ``version == 'gate_40'``, prints the result and returns ``(stat, p)``.
    """
    # Import the function explicitly: ``import scipy`` alone does not
    # guarantee that the ``scipy.stats`` submodule is loaded.
    from scipy.stats import mannwhitneyu

    rounds = data['sum_gamerounds']
    gate_30 = rounds[data['version'] == 'gate_30']
    gate_40 = rounds[data['version'] == 'gate_40']
    # Calculate statistic and p-value
    stat, p = mannwhitneyu(gate_30, gate_40)
    # Print result
    print('mann whitney stat=%.3f, p=%.3f' % (stat, p))
    return stat, p
mann_whitney_test(data)
def generate_unique_id(df):
    """Add a ``unique_id`` column numbering the rows 1..len(df), in place."""
    row_count = len(df)
    df['unique_id'] = range(1, row_count + 1)
def is_letter_in_Series(row):
    """Return True when ``row["Series"]`` contains at least one letter.

    A value that is not equal to itself (NaN) yields False.
    """
    series_value = row["Series"]
    if series_value != series_value:
        # Only NaN-like values compare unequal to themselves.
        return False
    return any(char.isalpha() for char in series_value)
def boxplot_compare_distributions(array1, array2, names = ["Sample 1", "Sample 2"]):
# Create a figure instance
fig = plt.figure(1, figsize=(9, 6))
# Create an axes instance
ax = fig.add_subplot(111)
# Create the boxplot
bp = ax.boxplot([array1, array2])
## change outline color, fill color and linewidth of the boxes
for box in bp['boxes']:
# change outline color
box.set( color='#7570b3', linewidth=2)
# change fill color
box.set( facecolor = '#1b9e77' )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
whisker.set(color='#7570b3', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps'
import pandas as pd
dfs = pd.read_csv('https://raw.githubusercontent.com/coding-blocks-archives/ML-Noida-2019-June-Two/master/datasets/hot-100.csv', parse_dates=['chart_debut'])
dfs['chart_debut'] = dfs['chart_debut'].dt.year
dfs.head(10)
from nltk.corpus import stopwords
texts = [['go', 'until', 'jurong', 'point'], ['crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']]
stopwords_set = set(stopwords.words('english'))
without_sw = [word for text in texts for word in text if word not in stopwords_set]
def join(rzd, auto):
    """Return the outer join of ``rzd`` with ``auto`` on their indexes."""
    return rzd.join(auto, how='outer')
def unique_id(df):
    """Return the values of the ``line_id`` column as a list, in row order.

    Iterating the column directly works for any index; the original looked
    rows up by the labels 0..n-1 and failed on a non-default index.
    """
    return list(df['line_id'])
unique_id(df)
def plot_num_of_hits(df):
    """Show a bar chart of ``num_of_hits`` per performer, largest first.

    Relies on ``plt`` (matplotlib.pyplot) being available in the module.
    """
    ranked = df.sort_values("num_of_hits", ascending=False)
    plt.bar(ranked.performer, ranked.num_of_hits)
    plt.show()
hotels_rev = df1[['date', 'hotel', 'revenue', 'av_revenue', 'difference', 'in_percent']].sort_values(by=['hotel', 'date'])
plt.bar(hotels_rev['date'], hotels_rev['av_revenue'], color='red')
plt.xlabel('Date')
plt.ylabel('Average revenues')
plt.title('Average revenues per hotel')
plt.xticks(rotation=90)
plt.show()
def move(directories, doc, shelf):
    """Move document ``doc`` onto shelf ``shelf``.

    Args:
        directories: Mapping of shelf name to a list of documents.
        doc: Document to move.
        shelf: Target shelf name.

    Returns:
        ``directories`` (mutated) on success, otherwise one of the error
        strings "ERROR NO SUCH KEY", "ERROR VALUE ALREADY EXISTS" or
        "ERROR NO SUCH VALUE".
    """
    # Validate the shelf first; indexing a missing shelf raised KeyError.
    if shelf not in directories:
        return "ERROR NO SUCH KEY"
    if doc in directories[shelf]:
        return "ERROR VALUE ALREADY EXISTS"
    # Look for the document among the shelf contents (not among the keys).
    source = next((docs for docs in directories.values() if doc in docs), None)
    if source is None:
        return "ERROR NO SUCH VALUE"
    source.remove(doc)
    directories[shelf].append(doc)
    return directories
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
doc = '11-2'
shelf = '3'
move(directories, doc, shelf)
# Use index as a unique identifier
df.index
# Use a column as the unique identifier
df['year'].values
def total_ingridients(cook_book):
    """Ask for a dish name and print every value of each of its ingredients."""
    chosen_dish = input("Enter dish: ")
    for ingredient in cook_book[chosen_dish]:
        for field_value in ingredient.values():
            print(field_value)
total_ingridients(cook_book)
def func_name(group_number, dct):
    """Print the records of ``dct`` whose fifth item equals ``group_number``.

    Records are ordered by their first item and printed as a zero-based
    position followed by the first three items.
    """
    members = sorted(
        (record for record in dct.values() if record[4] == group_number),
        key=lambda record: record[0],
    )
    for position, record in enumerate(members):
        print(position, *record[:3])
func_name('BST162', dct)
def merge_cols(df, col_list):
    """Collect the ``source_type`` values of each ``col_list`` group in a tuple.

    Args:
        df: DataFrame containing ``source_type`` and the grouping columns.
        col_list: Column names to group by.

    Returns:
        DataFrame with the grouping columns plus a ``source_type`` column of
        tuples.
    """
    # Select the column as a Series: tuple() over a one-column DataFrame
    # yields the column *name*, not the group's values.
    grouped = df.groupby(col_list)['source_type'].apply(tuple)
    return grouped.reset_index(name='source_type')
merge_cols(df, ['traffic_source', 'region'])
def get_sample_size(z=1.96, conf=0.95, sigsqr=1, delta=0.5):
    """Return ``z**2 * sigsqr / delta**2``; ``conf`` is accepted but unused."""
    numerator = z * z * sigsqr
    denominator = delta * delta
    return numerator / denominator
stopwords_set = set(stopwords.words('english'))
without_sw = [[word for word in text if word not in stopwords_set] for text in only_words_text]
without_sw = [item for sublist in without_sw for item in sublist]
hotels_rev.boxplot(column='av_revenue', by='hotel')
def how_much_water(water, load, clothes):
    """Return ``water`` grown by 10% for each item of ``clothes`` above ``load``.

    When ``clothes`` does not exceed ``load`` the base ``water`` is returned.
    """
    amount = water
    remaining = clothes
    # Iterative form of the original recursion: one 10% step per extra item.
    while not (remaining == load or remaining < load):
        amount = amount * 1.1
        remaining -= 1
    return amount
# incorrect (original author's marker, translated from Russian)
def fig(x=counts, y=label):
    """Show a horizontal plotly bar chart of ``y`` against ``x``.

    The defaults are taken from the module-level ``counts`` and ``label`` at
    definition time. The arguments are now actually used; the original body
    ignored them and always plotted the globals.
    """
    chart = px.bar(x=x, y=y, orientation='h')
    chart.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
    chart.show()
import pandas as pd
water = [1,2,3,4,2,4,2,4,5,2,3,4,2,1,3,4,3,2,5,1]
nutri = [1,2,4,6,5,6,7,5,4,5,6,7,4,3,5,5,6,5,4,3,5]
mineral =[2,1,1,3,2,4,2,4,5,4,3,2,3,2,3,1,3,4,5,1,4]
treatments = pd.DataFrame({"water": water, "nutri": nutri, "mineral": mineral}, index=range(21))
treatments.reset_index().melt(id_vars=["index"], var_name="treatments", value_name="value")
from scipy.stats import mannwhitneyu
import numpy as np
def mann_whitney_plus_means(turnstile_weather):
'''
This function will consume the turnstile_weather dataframe containing
our final turnstile weather data.
You will want to take the means and run the Mann Whitney U test on the
ENTRIESn_hourly column in the turnstile_weather dataframe.
This function should return:
1) the mean of entries with rain
2) the mean of entries without rain
3) the Mann-Whitney U statistic and p-value comparing the number of entries
with rain and the number of entries without rain
You should feel free to use scipy's Mann-Whitney implementation, and you
might also find it useful to use numpy's mean function.
Here are the functions' documentation:
http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.
def interval(n, mean, sig, conf):
    """Return the rounded width of the normal interval for the sample mean."""
    standard_error = sig / np.sqrt(n)
    lower, upper = stats.norm.interval(conf, loc=mean, scale=standard_error)
    return round(upper - lower)
def move(directories, v, k):
    """Move value ``v`` onto shelf ``k`` and return ``directories``.

    Args:
        directories: Mapping of shelf name to a list of documents.
        v: Document to move.
        k: Target shelf name.

    Returns:
        ``directories`` in every case; on error a message is printed and the
        mapping is left unchanged.
    """
    if k not in directories:
        print('ERROR NO SUCH KEY')
        return directories
    # Search every shelf for the value; the original looked only at the
    # target shelf, so every genuine move was reported as an error.
    holders = [docs for docs in directories.values() if v in docs]
    if not holders:
        print('ERROR NO SUCH VALUE')
        return directories
    for docs in holders:
        docs.remove(v)
    directories[k].append(v)
    return directories
directories = {
'1': ['2207 876234', '11-2'],
'2': ['10006'],
'3': []
}
move(directories, '11-2', '3')
def mean_median(x):
    """Return the pair ``(mean, median)`` of ``x`` computed with numpy."""
    average = np.mean(x)
    middle = np.median(x)
    return average, middle
mean_median(water)
misclassified = []
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'message': df.iloc[index]['Message'], 'actual': df.iloc[index]['Category'], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
df[df[['name']].duplicated(keep=False)]
df['name'].sort_values()
def html_escape(text):
    """Return ``text`` with ``&``, ``<``, ``>`` and ``"`` replaced by HTML entities.

    The original replaced each character with itself, so nothing was ever
    escaped.
    """
    # '&' must be handled first, otherwise the ampersands introduced by the
    # other entities would be escaped a second time.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    text = text.replace('"', '&quot;')
    return text
def group_lst(num):
    """Print every item stored under key ``num`` of the module-level ``dct``."""
    members = dct[num]
    for member in members:
        print(member)
import math
import scipy
from scipy import stats
def sample_size(delta, sigsqr, conf):
    """Return ``ceil(2 * z**2 * sigsqr / delta**2)`` with ``z = norm.ppf(conf)``."""
    z = stats.norm.ppf(conf)
    required = (2 * z * z * sigsqr) / (delta ** 2)
    return math.ceil(required)
sample_size(10, 100, 0.95)
fig.update_layout(title_text='Какая рабочая среда для вас наиболее предпочтительна?')
fig.show()
import math
class Point:
    """A point on the plane with ``x`` and ``y`` coordinates (default 0, 0)."""

    def __init__(self, x=0, y=0):
        self.x, self.y = x, y
# TODO Write a function calculating distance between Point a and Point b.
def distance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``."""
    dx_squared = (a.x - b.x)**2
    dy_squared = (a.y - b.y)**2
    return math.sqrt(dx_squared + dy_squared)
a = Point(1,1)
b = Point(1,2)
print(distance(a,b))
df_new['unique_id'] = pd.Series(range(1, df_new.shape[0]+1))
df_new.head()
sl = [0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067,0.067]
sw = [0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050,0.050]
scaled_data = { }
# Add code here
df = pd.DataFrame(scaled_data, columns=['sl', 'sw'])
def decade(df):
    """Return per-decade counts of the rows whose ``rating`` equals 5.0."""
    top_rated = df[df.rating == 5.0]
    counts = top_rated.decade.value_counts()
    return counts
from sklearn.metrics import f1_score
y_pred = lda.predict(X_test)
f1_score(y_test, y_pred)
def year_leaders(df):
    """Return the largest ``num_of_hits`` for each ``chart_debut`` value."""
    hits_by_year = df.groupby('chart_debut')['num_of_hits']
    return hits_by_year.max()
def group_movies(df):
def is_month_end(date):
    """Return 1 when the last two characters of ``date`` are '31', else 0."""
    return 1 if date[-2:] == '31' else 0
is_month_end(df['Date'])
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
le = LabelEncoder()
le.fit(data_class)
data_class = le.transform(data_class)
X_train, X_test, y_train, y_test = train_test_split(data, data_class, random_state=42)
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)
f1 = f1_score(y_test, y_pred, average='macro') # average='macro'
print('F1:', f1)
#confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_
def add(a, b):
    """Return the result of ``a + b``."""
    result = a + b
    return result
add(1, 2)
def how_much_water(water, load, clothes):
    """Return ``water`` scaled by 1.1 raised to ``clothes - load``."""
    extra_items = clothes - load
    factor = 1.1 ** extra_items
    return water * factor
how_much_water(5, 10, 14)
region direct yandex google
0 Russia 1 4 0
1 Germany 0 1 0
2 USA 0 0 1
3 Italy 0 1 0
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
from math import sqrt
def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``, rounded to 2 places."""
    squared_sum = (a.x - b.x)**2 + (a.y - b.y)**2
    return round(sqrt(squared_sum), 2)
from math import sqrt
def distance(a, b):
    """Return the straight-line distance between points ``a`` and ``b``."""
    horizontal = a.x - b.x
    vertical = a.y - b.y
    return sqrt(horizontal**2 + vertical**2)
def replace_student(lst):
    """Set the second-to-last item to '9090' where it equals the first item.

    Records are modified in place; a new list holding the same records is
    returned.
    """
    def _patched(record):
        if record[0] == record[-2]:
            record[-2] = '9090'
        return record

    return [_patched(record) for record in lst]
print(replace_student(lst))
df['is_loyal'] = df.duplicated(subset='name', keep=False).apply(lambda x: 'True' if x else 'False')
out = (x*5 for x in y)
y = {'Marlboro': [3, 13, 6, 66, 13, 7, 13]}
for el in y.get('Marlboro'):
el * portions
print(el)
import pandas as pd
def get_dataframe(sl, sw):
    """Return a DataFrame with columns ``sl`` and ``sw`` built from the arguments."""
    columns = dict(sl=sl, sw=sw)
    return pd.DataFrame(columns)
sl = [1, 2, 3, 4, 5]
sw = [1, 2, 3, 4, 5]
df = get_dataframe(sl, sw)
df
def find_index(lst, ind):
    """Return the item at ``ind``, wrapping around the length of ``lst``."""
    wrapped = ind % len(lst)
    return lst[wrapped]
find_index(["a", "b", "c", "d"], 1)
df = pd.DataFrame({'chart_debut': ['2012', '2012', '2012', '2014', '2017'], 'num_of_hits': [1,2,3,4,5]})
def year_leaders(df):
    """Return the column-wise maximum of ``df`` for each ``chart_debut`` value."""
    by_year = df.groupby('chart_debut')
    return by_year.max()
year_leaders(df)
if word.find(first) + word.find(second) == -2:
s = word.find(first)
if word[s + 1] == second:
return True
else:
return False
else:
return False
goes_after("world", "o", "r")
def prepare_dish(dish, portions):
    """Multiply every ingredient's ``quantity`` by ``portions`` in place.

    Returns the same ``dish`` sequence.
    """
    for item in dish:
        scaled = item['quantity'] * portions
        item['quantity'] = scaled
    return dish
prepare_dish(dish, portions)
def year_leaders_all(dfp):
    """Return, for each ``chart_debut``, the row with the most ``num_of_hits``.

    Works on a copy; rows come back sorted by ``chart_debut`` and
    ``num_of_hits`` in descending order.
    """
    ranked = dfp.copy().sort_values(
        by=['chart_debut', 'num_of_hits'], ascending=False
    )
    return ranked.drop_duplicates(subset='chart_debut', keep='first')
year_leaders_all(dfp)
df3[df3.how_find_us.str.contains('yandex').drop_duplicates(keep=False)
class Solution:
    """Minimal example class with a single greeting method."""

    def main(self):
        """Print the greeting."""
        greeting = "Hello World!"
        print(greeting)
Solution.main("parameter1","parameter2")
import math
class Point:
    """A point on the plane holding its ``x`` and ``y`` coordinates."""

    def __init__(self, x, y):
        self.x, self.y = x, y
def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``."""
    sum_of_squares = (a.x - b.x)**2 + (a.y - b.y)**2
    return math.sqrt(sum_of_squares)
df.plot.bar(rot=0)
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'message': X_test[index], 'actual': y_test[index], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
def move_doc(directories, doc, shelf):
    """Move ``doc`` from whichever shelf holds it onto ``shelf``.

    Args:
        directories: Mapping of shelf name to a list of documents.
        doc: Document to move.
        shelf: Target shelf name.

    Returns:
        ``directories``; it is left unchanged (and a message is printed)
        when the shelf or the document does not exist.
    """
    if shelf not in directories:
        # Previously fell through to directories[shelf] and raised KeyError.
        print('No such key')
        return directories
    # Look for the document on every shelf; the original checked only the
    # target shelf and then appended a duplicate without removing anything.
    holders = [docs for docs in directories.values() if doc in docs]
    if not holders:
        print('No such value')
        return directories
    for docs in holders:
        docs.remove(doc)
    directories[shelf].append(doc)
    return directories
directories = {
'1': ['2207 876234', '11-2', '5455 028765'],
'2': ['10006', '5400 028765', '5455 002299'],
'3': []
}
doc = '11-2'
shelf = '3'
move_doc(directories, doc, shelf)
def how_much_water(L,X,N):
    """Return ``L`` scaled by 1.1 raised to the power ``N - X``."""
    growth_rate = 1 + 0.1
    return L * growth_rate ** (N - X)
how_much_water(5, 10, 14)
def _if(bool, func1, func2):
if bool:
func1()
else:
func2()
def truthy():
    """Print the text "True"."""
    print("True")
def falsey():
    """Print the text "False"."""
    print("False")
_if(True, truthy, falsey)
plt.xticks(ks)
misclassified = []
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'message': df.iloc[index]['message'], 'actual': y_test[index], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 16:34:47 2020
@author: jordan
"""
def _tail(x, modulus):
    """Return ``x`` modulo ``modulus`` (10 or 100 for the helpers below)."""
    return x % modulus


def ends77(x):
    """True when ``x`` modulo 100 equals 77."""
    return _tail(x, 100) == 77


def ends7(x):
    """True when ``x`` modulo 10 equals 7."""
    return _tail(x, 10) == 7


def ends00(x):
    """True when ``x`` modulo 100 equals 0."""
    return _tail(x, 100) == 0


def ends0(x):
    """True when ``x`` modulo 10 equals 0."""
    return _tail(x, 10) == 0


def dropdollar(x):
    """True when ``x`` modulo 100 is 0 or 7."""
    return _tail(x, 100) == 0 or _tail(x, 100) == 7


def dropdollars(x):
    """Same check as ``dropdollar``: ``x`` modulo 100 is 0 or 7."""
    return _tail(x, 100) == 0 or _tail(x, 100) == 7
money = 0
trials = 100000
for i in range(trials):
num = np.random.randint(1, 1000)
if num == 777: money += 200; successes += 1
elif num == 999: money += 100; successes += 1
elif num == 555: money += 50; successes += 1
elif num == 333: money += 15;
def max_in_dictionary(d):
    """Return ``(key, entry)`` for the entry of ``d`` with the largest ``'Value'``."""
    return max(d.items(), key=lambda item: item[1]['Value'])
max_in_dictionary(rates)
def am_i_wilson(n):
    """Return True when ``n`` is a Wilson prime.

    A Wilson prime ``p`` satisfies ``((p - 1)! + 1) % p**2 == 0``. Divisibility
    by ``p**2`` implies divisibility by ``p``, which by Wilson's theorem
    already forces ``p`` to be prime, so no separate primality test is needed.
    The original compared ``(n-1)! + 1`` for equality with ``n*n*(n-2)!`` and
    raised ValueError for ``n < 2``.
    """
    from math import factorial

    if n < 2:
        return False
    return (factorial(n - 1) + 1) % (n * n) == 0
# Write your code here
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('./data/nutri.csv')
df.plot.bar()
plt.show()
import pandas
users = pandas.read_csv('ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
ratings = pandas.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
avg_lifetime = ratings.groupby('user_id')['timestamp'].agg(['max','min']).mean(axis=1).mean()
print(avg_lifetime)
def to_csv_string(array):
    """Render a 2-D sequence as CSV text: commas within rows, newlines between."""
    rendered_rows = (','.join(map(str, row)) for row in array)
    return '\n'.join(rendered_rows)
to_csv_string([[ 0, 1, 2, 3, 4 ],
[ 10,11,12,13,14 ],
[ 20,21,22,23,24 ],
[ 30,31,32,33,34 ]])
def rich_people(df, a, b):
    """Return the rows of ``df`` whose ``annual_inc`` lies in ``[a, b]`` inclusive."""
    income = df.annual_inc
    in_range = (income >= a) & (income <= b)
    return df[in_range]
rich_people(df, 100000, 1000000)
def hot_years_create(df_ru, min_temp):
    """Return the years whose mean ``av_temp`` is strictly above ``min_temp``.

    The result has ``year`` and ``av_temp`` (yearly mean) columns.
    """
    yearly_mean = df_ru.groupby(['year'])['av_temp'].mean().reset_index()
    is_hot = yearly_mean['av_temp'] > min_temp
    return yearly_mean[is_hot]
hot_years_create(df_ru, 15)
def how_much_water(water, load, clothes):
    """Return ``water``, increased by 10% for every item above ``load``.

    When ``load >= clothes`` the base amount is returned unchanged.
    """
    return water if load >= clothes else water * 1.1 ** (clothes - load)
how_much_water(5, 10, 14)
def to_binary(n):
    """Return ``bin(n)``: the binary text of ``n`` including the ``0b`` prefix."""
    binary_text = bin(n)
    return binary_text
# artificial parameters
def ackley(x):
    """Evaluate the two-dimensional Ackley function at ``x = (x0, x1)``."""
    first, second = x[0], x[1]
    radial_term = -0.2 * np.sqrt(0.5 * (first ** 2 + second ** 2))
    cosine_term = 0.5 * (np.cos(2. * np.pi * first) + np.cos(2. * np.pi * second))
    return -20. * np.exp(radial_term) - np.exp(cosine_term) + 20. + np.e
bounds = [(-10, 10), (-10, 10)]
def plot_optimisation(strategy):
result = differential_evolution(ackley, bounds, strategy=strategy, seed=42)
x = np.linspace(-10, 10, 100)
y = np.linspace(-10, 10, 100)
X, Y = np.meshgrid(x, y)
Z = np.array([ackley([x, y]) for x, y in zip(np.ravel(X), np.ravel(Y))
def round(a, b):
    """Return ``a + b``.

    Note that this name shadows the builtin ``round`` in the module.
    """
    total = a + b
    return total
add(1, 2.5)
def remove_duplicate(df):
    """Return every row of ``df`` whose ``name`` occurs more than once.

    Despite the function name, nothing is removed: all occurrences of a
    duplicated name are kept and unique names are filtered out.
    """
    repeated_name = df.duplicated(subset="name", keep=False)
    return df[repeated_name]
def interval(n, mean, sig, conf):
    """Return ``int(sig / sqrt(n) * norm.ppf(conf))``; ``mean`` is not used."""
    standard_error = sig / (n ** 0.5)
    return int(standard_error * norm.ppf(conf))
def sum_matrix(N):
    """Return the sum of the N x N diagonal matrix built from ``linspace(N, 0, N)``.

    ``np.diag`` builds a 2-D matrix from the 1-D values; the original called
    ``np.diagonal``, which extracts a diagonal and rejects 1-D input.
    """
    my_matrix = np.diag(np.linspace(N, 0, N))
    return my_matrix.sum()
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
def math(a):
    """Return ``a * 50 + 6``, or the string "Error" when ``a`` is a string.

    The original compared ``type(a)`` with the *string* 'str', which is never
    equal, so strings were never rejected.
    """
    if isinstance(a, str):
        return "Error"
    return (a * 50) + 6
math(5)
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 2), columns=['Col1', 'Col2'])
X_train, X_test, y_train, y_test = train_test_split(
df['Col1'],
df['Col2'],
random_state=1
)
def convert_to_float(x_train, x_test, y_train, y_test):
    """Return the four inputs, each cast to ``float``, as a tuple in the same order."""
    parts = (x_train, x_test, y_train, y_test)
    return tuple(part.astype(float) for part in parts)
convert_to_float(X_train, X_test, y_train, y_test)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.DataFrame(data=[1,1,2,2,4,1,3,2,5,4,3,2,4,1,3,4,3,2,4,5,1,3],
columns=['water'])
df['nutri'] = pd.DataFrame(data=[1,2,2,4,6,2,4,5,4,5,6,4,3,3,5,5,6,5,4,3,3,5],
columns=['nutri'])
df['mineral'] = pd.DataFrame(data=[2,1,1,3,2,4,1,2,5,4,3,3,2,2,3,1,3,4,5,4,1,3],
columns=['mineral'])
plt.figure(figsize=(12,5))
sns.
def is_month_end(date):
    """Return 1 when ``date`` ends with the two characters '31', otherwise 0."""
    day_part = date[-2:]
    if day_part != '31':
        return 0
    return 1
df['is_month_end'] = df['Date'].apply(is_month_end)
df.head()
def columns_to_rows(dataframe):
    """Return the row labelled 0 of ``dataframe`` as a one-row DataFrame."""
    first_row = dataframe.loc[0]
    return pd.DataFrame(first_row).transpose()
print(', '.join(map(str, range(1, len(group_list(dct, 'BST161')) + 1))))
def max_key(dct):
    """Return the key whose nested ``'Value'`` entry is the largest.

    Ties go to the first such key. An empty mapping yields ``' '`` as before.
    The original seeded its running maximum with 0 and therefore returned
    ``' '`` whenever every value was zero or negative.
    """
    if not dct:
        return ' '
    return max(dct, key=lambda key: dct[key]['Value'])
dct = {'a': {'Value': 1, 'Other': 2}, 'b': {'Value': 5, 'Other': 4}, 'c': {'Value': 3, 'Other': 4}}
max_key(dct)
def grass (df):
    """Return the rows of ``df`` where neither class column equals 'Rock'."""
    primary_ok = df['Class 1'] != 'Rock'
    secondary_ok = df['Class 2'] != 'Rock'
    return df[primary_ok & secondary_ok]
def filter_df(df, column):
    """Return all rows whose ``column`` value is repeated, sorted by that column."""
    is_repeated = df[column].duplicated(keep=False)
    repeated_rows = df[is_repeated]
    return repeated_rows.sort_values(column)
df = pd.DataFrame({'name': ['Ksenia Rodionova', 'Ulyana Selezneva', 'Konstantin Prokhorov',
'Petrov Vladimir', 'Arina Selivanova', 'Ksenia Rodionova'],
'profit_per_room': [1639.000000, 930.000000, 1057.720000, 1403.000000, 1639.000000, 1639.000000]})
filter_df(df, 'name')
df = pd.DataFrame([['Ksenia Rodionova', 'Artur Petrov', 'Ivan Sidorov', 'Ksenia Rodionova']]).T
df.columns = ['name']
df.drop_duplicates(keep = 'first', inplace = True)
df.sort_values(by = 'name', ascending = True)
df_type['App'].groupby(df_type['Type']).sum().plot(kind='pie',
figsize=(5, 6),
autopct='%1.1f%%', # add in percentages
startangle=90, # start angle 90° (Africa)
shadow=True, # add shadow
)
plt.title('Pie chart of the Repartition between Free and Paid Apps')
plt.axis('equal') # Sets the pie chart to look like a circle.
plt.show()
def find_non_numbers(df, col):
    """Replace, in place, every string value in ``df[col]`` with NaN.

    The original only rebound its loop variable, which left the DataFrame
    untouched. Returns ``None``.
    """
    df[col] = df[col].apply(
        lambda value: float('nan') if isinstance(value, str) else value
    )
find_non_numbers(df, 'loan_amnt')
def ratings(x):
    """Map a numeric rating to 'Low' (<= 2.0), 'Average' (<= 4.0) or 'High'."""
    if x <= 2.0:
        return 'Low'
    if x <= 4.0:
        return 'Average'
    return 'High'
df['rating'] = df['rating'].apply(ratings)
df.head()
def transpose(matrix):
    """Return the transpose of ``matrix`` as a list of lists."""
    return [list(column) for column in zip(*matrix)]
transpose(matrix)
sl = [0.8,0.9,0.9,1.0,1.1,1.1,1.2,1.4,1.4,1.5,1.6,1.6,1.7,1.7,1.8,1.8,1.8,1.9,1.9,2.0,2.0,2.1,2.2,2.2,2.2,2.2,2.2,2.2,2.2,2.3,2.3,2.3,2.3,2.3,2.4,2.4,2.4,2.4,2.4,2.4,2.4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.
def chart_to_hits(df):
    """Add a ``hits`` column with each performer's songs and keep one row each.

    ``hits`` is the comma-joined list of the performer's ``song`` values. The
    frame is modified in place (duplicate performers are dropped) and
    returned.
    """
    # transform keeps the result aligned with the original rows; the previous
    # reset_index() assignment matched groups to rows by position, attaching
    # hits to the wrong performers.
    df['hits'] = df.groupby('performer')['song'].transform(','.join)
    df.drop_duplicates(subset='performer', inplace=True)
    return df
misclassified = []
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'message': df.iloc[index]['Message'], 'actual': df.iloc[index]['Category'], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
def divide_decade(df, decade):
    """Return the rows whose ``chart_debut`` falls in the same decade as ``decade``."""
    target = decade // 10
    same_decade = df['chart_debut'] // 10 == target
    return df[same_decade]
arr = [2,1,10,5]
def sum(arr):
    """Return the total of ``arr[i] + arr[j]`` over all index pairs ``i < j``.

    Note that this name shadows the builtin ``sum`` in the module.
    """
    total = 0
    length = len(arr)
    for first in range(length):
        for second in range(first + 1, length):
            total += arr[first] + arr[second]
    return total
sum(arr)
import requests
from bs4 import BeautifulSoup
def get_book_files(url):
    """Return the ``href`` of every link on the page at ``url`` containing '.txt'."""
    page = requests.get(url)
    parsed = BeautifulSoup(page.text, 'lxml')
    anchors = parsed.find_all('a', href=True)
    return [anchor['href'] for anchor in anchors if '.txt' in anchor['href']]
books = get_book_files('http://stup.pro/wp-content/uploads/2023/03/')
print(books)
import numpy as np
from scipy import stats
def seed(seed):
    """Seed numpy's RNG and draw ``(n, mean, sig, conf)`` test parameters.

    The draws happen in that order, so a given seed always yields the same
    tuple.
    """
    np.random.seed(seed)
    pick = np.random.choice
    n = pick(range(10, 26))
    mean = pick(range(120, 141))
    sig = pick(range(10, 21))
    conf = pick([0.90, 0.95, 0.98, 0.99, 0.999])
    return n, mean, sig, conf
def interval(n, mean, sig, conf):
    """Return the rounded half-width ``t * sig / sqrt(n)`` of a t-based interval.

    ``t`` is the ``(1 + conf) / 2`` quantile of Student's t with ``n - 1``
    degrees of freedom; ``mean`` is not used.
    """
    t_critical = stats.t.ppf((1 + conf) / 2, n - 1)
    half_width = sig * t_critical / np.sqrt(n)
    return np.round(half_width)
print(interval(*seed(12)))
print(interval(*seed(45)))
print(interval(*seed(7)))
# The result of the function should be three values: 12, 28, 21
import math
def get_sample_size(error, variance, confidence):
    """Return ``ceil(error * sqrt(2 * (1 - confidence)) * sqrt(variance) / error**2)``."""
    sqrt_variance = math.sqrt(variance)
    numerator = error * math.sqrt(2 * (1 - confidence)) * sqrt_variance
    denominator = error * error
    return math.ceil(numerator / denominator)
get_sample_size(0.02, 0.05, 0.95)
df2['region'] = df2['keyword'].apply(geo_class)
def replace_vowels(vowels):
    """Replace integer vowel codes (a, e, i, o, u) with their characters.

    Strings and all other values are copied to the result unchanged.
    """
    vowel_codes = [97, 101, 105, 111, 117]  # a, e, i, o, u
    return [
        chr(item) if not isinstance(item, str) and item in vowel_codes else item
        for item in vowels
    ]
inp = [118, "u",120,121,"u",98,122,"a",120,106,104,116,113,114,113,120,106 ]
print(replace_vowels(inp))
def object_finder(row):
    """Return ``row['values']`` when it is exactly a ``str`` or ``list``, else None."""
    candidate = row['values']
    if type(candidate) in (str, list):
        return candidate
    return None
df['object'] = df.apply(object_finder, axis=1)
sns.barplot(data=top20,
x='Score',
y='Country or region',
color='#5ed14f')
plt.xlim(6,8)
# 1. how much water does my washing machine use
# 2. how much water does my clothes need for washing
# 1. 5 litres
# 2. (1.1 ^ (14 - 10)) * 5
# 3. 5 * 1.1 ^ 4
# 4. 5 * 1.1 * 1.1 * 1.1 * 1.1 = 7.4074
def how_much_water(water, load, clothes):
    """Return ``water`` multiplied by 1.1 once for every item above ``load``."""
    growth = 1.1 ** (clothes - load)
    return growth * water
print(how_much_water(5, 10, 14))
def generate_unique_id(dataframe):
    """Shift the index of ``dataframe`` up by one, in place, and return it."""
    shifted_index = dataframe.index + 1
    dataframe.index = shifted_index
    return dataframe
generate_unique_id(df_new)
def respiratory_rate_function(x):
    """Clamp ``x`` to the range 1..4, rounding values that are already inside it."""
    if x < 1:
        return 1
    if x > 4:
        return 4
    return round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].apply(respiratory_rate_function)
def sort_by_max(x):
    """Return ``x`` sorted by its ``max`` column in ascending order."""
    ordered = x.sort_values(by='max', ascending=True)
    return ordered
def most_5(data):
    """Return the single row of ``data`` with the highest ``rating``."""
    # NOTE(review): the result of this per-movie count is discarded, so it has
    # no effect on the return value. Presumably the movie with the most
    # ratings was meant to be returned; confirm the intended behaviour.
    data.groupby('movieId')['rating'].count()
    return data.sort_values(by = 'rating', ascending = False).head(1)
most_5(df)
def find_difference(a, b):
    """Return the absolute difference between the products of ``a`` and ``b``.

    Uses ``math.prod`` instead of ``reduce``, which is not a builtin in
    Python 3 and was not imported. An empty sequence has product 1.
    """
    from math import prod

    return abs(prod(a) - prod(b))
def avg_date_by_operator(df_new):
    """Return the mean of the ``date`` column for each ``operator``.

    The column is selected explicitly; the original ``.agg("date")`` only
    worked because pandas passes a non-callable attribute name through.
    """
    return df_new.groupby("operator")["date"].mean()
df_2015 = pd.read_csv("bus_trucks_2015.csv")
avg_date_by_operator(df_2015)
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame({'Day':['Tuesday','Wednesday','Thursday','Friday','Saturday','Monday','Sunday'],
'Value':[358114,345393,323337,293805,292016,278905,273823]})
df.plot.barh()
df[df.duplicated(subset=['name'], keep= False)][['name']]
import pandas as pd
# function that: calculate the average temperature in countries
def average_temp_of_country(df):
    """Return the mean ``av_temp`` of each country."""
    temps_by_country = df.groupby('country')['av_temp']
    return temps_by_country.mean()
# function that: build a list of the 20 coldest countries in ascending av_temp order
def coldest_20_countries(df):
    """Return the 20 lowest country mean temperatures, coldest first."""
    mean_temp = df.groupby('country')['av_temp'].mean()
    coldest_first = mean_temp.sort_values()
    return coldest_first[:20]
from gensim import similarities
cos_sim = similarities.MatrixSimilarity(tfidf[bows])
from math import sqrt
def get_sample_size(delta, conf, sigsqr):
    """Return ``int(sigsqr * z**2 / delta**2)`` for a two-sided confidence level.

    Args:
        delta: Allowed error (margin).
        conf: Confidence level in (0, 1); ``z`` is the ``(1 + conf) / 2``
            standard-normal quantile (about 1.96 for 0.95, the value that was
            previously hard-coded regardless of ``conf``).
        sigsqr: Variance.

    Returns:
        The sample size truncated to an int.
    """
    from statistics import NormalDist

    z = NormalDist().inv_cdf((1 + conf) / 2)
    return int(sigsqr * z ** 2 / delta ** 2)
get_sample_size(100, 0.95, 2000) # => 477
stopwords_set = set(stopwords.words('english'))
only_words_text = [[word for word in word_tokenize(text.lower()) if word not in stopwords_set] for text in df_message["text"]]
my_regex = re.compile(r"[a-z][A-Z][0-9]{4,16}")
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
def dataframe(sl, sw):
    """Return a DataFrame with the columns ``sl`` and ``sw``."""
    return pd.DataFrame(dict(sl=sl, sw=sw))
sl = [[-0.90068117], [-1.14301691], [-1.38535265], [-1.50652052], [-1.02184904], [-0.53717756], [-1.50652052], [-1.02184904], [-1.74885626], [-1.14301691]]
sw = [[3.5], [2.5], [2.4], [1.5], [3.5], [2.2], [2.1], [1.5], [1.1], [1.3]]
dataframe(sl, sw)
def my_function(x1, x2):
    """Plot density histograms (6 bins, half-transparent) of ``x1`` and ``x2``.

    Returns the two inputs unchanged.
    """
    # NOTE(review): the legend is requested before anything is plotted and the
    # histograms carry no labels, so it presumably comes out empty; consider
    # calling it after the plots, with labels.
    plt.legend(loc='upper left')
    plt.title('Сравнение распределений с собственным жильем и без')
    x1.plot(kind='hist',
            alpha=0.5,
            bins=6,
            density=True)
    x2.plot(kind='hist',
            alpha=0.5,
            bins=6,
            density=True)
    return x1, x2
my_function(x1, x2)
def replace_non_numbers(df, column):
    """Convert ``df[column]`` to numbers in place; unparseable values become NaN."""
    coerced = pd.to_numeric(df[column], errors='coerce')
    df[column] = coerced
import pandas as pd
def divide_hotels(df):
    """Split ``profit`` into size-specific columns based on ``total_rooms``.

    Adds three columns to ``df`` (in place) and returns it:
      * ``big_hotels``    - profit where ``total_rooms > 30``
      * ``medium_hotels`` - profit where ``20 < total_rooms <= 30``
      * ``small_hotels``  - profit where ``10 < total_rooms <= 20``
    Rows outside a band hold NaN in that band's column.

    The original appended the whole ``profit`` column once per matching row
    and then assigned lists of the wrong length, which raised ValueError.
    """
    rooms = df['total_rooms']
    profit = df['profit']
    df['big_hotels'] = profit.where(rooms > 30)
    df['medium_hotels'] = profit.where((rooms > 20) & (rooms <= 30))
    df['small_hotels'] = profit.where((rooms > 10) & (rooms <= 20))
    return df
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform('median'))
def my_f(col_name, df):
    """Return NaN rows of ``col_name`` that resemble the preceding NaN row.

    Among the rows where ``df[col_name]`` is NaN, a row is kept when its
    ``pulse`` or its ``respiratory_rate`` differs from the previous such row
    by at most 20% (relative to the current row's value). Rows matching both
    criteria appear once.
    """
    # get the rows for which the column is NaN
    df = df[ np.isnan(df[col_name]) ]
    # find the rows that have similar pulse
    df_pulse = df[ abs( (df.pulse - df.pulse.shift(1)) / df.pulse ) <= 0.2 ]
    # find the rows that have similar respiratory_rate
    # NOTE(review): if col_name is 'respiratory_rate' every value here is NaN
    # and this filter selects nothing; confirm which columns callers pass.
    df_respiratory_rate = df[ abs( (df.respiratory_rate - df.respiratory_rate.shift(1)) / df.respiratory_rate ) <= 0.2 ]
    # merge the dataframes
    df_merged = pd.concat( [df_pulse, df_respiratory_rate] )
    # get rid of duplicates
    df_merged = df_merged.drop_duplicates()
    # return the rows
    return df_merged
def replace(item, list1, replacing_number):
    """Replace, in place, every occurrence of ``item`` in the nested lists of ``list1``."""
    for row in list1:
        for position, value in enumerate(row):
            if value == item:
                row[position] = replacing_number
replace('4004', students, '9090')
from gensim import similarities
cos_sim = similarities.MatrixSimilarity(tfidf[bows])
def filter_coldest(df, year_threshold):
    """Return the 20 lowest country mean temperatures for years after ``year_threshold``."""
    recent = df[df['year'] > year_threshold]
    mean_by_country = recent.groupby('country')['av_temp'].mean()
    return mean_by_country.sort_values()[:20]
filter_coldest(df, 1980)
from scipy import stats
import numpy as np
sync = [85.1, 83.8, 69.9, 82.1, 84.4, 80.4, 78.1, 88.4, 77., 91.5, 76.7, 86.6, 91.8, 73.3, 83.9, 76.7, 85.8, 89.6, 91.7, 87.2, 79., 85.3]
asyncr = [89.8, 81.6, 87.4, 81., 66.9, 72.5, 78.4, 68.5, 78.3, 62.6, 73.7, 77.7, 63., 77.5]
stats.ttest_ind(sync, asyncr, equal_var = False)
np.var(sync), np.var(asyncr)
def find_non_numbers(df, column):
    """Return the rows whose ``column`` value, as text, is not ``str.isnumeric``."""
    as_text = df[column].astype(str)
    looks_numeric = as_text.str.isnumeric()
    return df[~looks_numeric]
df = pd.read_csv('ratings.csv')
df.head()
df.columns
df.groupby('userId').size()
df.groupby('userId').size().index
df.groupby('userId').size().values
#df.groupby('userId').size().values > 100
df.groupby('userId').size().values[df.groupby('userId').size().values > 100]
df.groupby('userId').size().index[df.groupby('userId').size().values > 100]
df[df['userId'] == 1]
df[df['userId'] == 1].timestamp
df[df['userId'] == 1].timestamp.diff()
df[df['userId'] == 1].timestamp.diff().min()
df[df['userId'] == 1].timestamp.diff().max()
df[df['userId'] == 1].timestamp.diff().max() - df[df['userId'] == 1].timestamp.diff().min()
def get_lif
df = pd.DataFrame({'date': ['1743-12-01', '1744-01-01', '1744-02-01', '1744-03-01', '1744-08-01'],
'av_temp': [0, 10, 20, 30, 40],
'deviations': [0, 10, 20, 30, 40],
'country': ['Åland', 'Åland', 'Åland', 'Åland', 'Åland']
})
# Find the rows in the dataview where the values in the name column are duplicated.
df[df.duplicated(subset='name', keep=False)]
# Create a new dataview in which the first row of the duplicate and all subsequent ones will be added. Sort the name column in ascending order
df.sort_values("name").drop_duplicates(subset="name", keep='first')
from collections import Counter
def most_incident(df, column):
    """Return the most frequent value found in ``df[column]``."""
    tally = Counter(df[column])
    top_entries = tally.most_common(1)
    # most_common yields (value, count) pairs; only the value is returned.
    return top_entries[0][0]
most_incident(df_new, 'route')
def plot_speed(data, speed_name, boosted_speed_name):
    """Show side-by-side box plots of two columns of ``data``.

    Args:
        data: DataFrame holding both columns.
        speed_name: Column drawn in blue on the left axes.
        boosted_speed_name: Column drawn in red on the right axes.
    """
    fig = plt.figure()
    # Keep both axes: the original stored them in one variable, so the first
    # subplot stayed empty and both boxes were drawn on the second.
    left_axes = fig.add_subplot(1, 2, 1)
    right_axes = fig.add_subplot(1, 2, 2)
    data[speed_name].plot.box(ax=left_axes, color='blue')
    data[boosted_speed_name].plot.box(ax=right_axes, color='red')
    plt.show()
# Import necessary libraries
from math import sqrt, erf
def interval(n, mean, sig, conf):
    """Return ``int(z * sig / sqrt(n))`` with ``z = erf(conf + 0.5)``.

    ``mean`` is accepted but not used.
    """
    # NOTE(review): erf never exceeds 1, so z here cannot be a normal quantile
    # such as 1.96; a quantile function was presumably intended. Verify the
    # formula against the expected results.
    z = erf(conf + 0.5)
    h = z * sig / sqrt(n)
    return int(h)
# The result of the function should be three values: 12, 28, 21
interval(100, 100, 10, 0.95)
interval(100, 100, 10, 0.99)
interval(100, 100, 10, 0.995)
def get_id(df):
    """Return the ``id`` attribute (column) of ``df``."""
    return getattr(df, 'id')
get_id(df)
def to_int64(df):
    """Cast every column of ``df`` to ``int64`` in place and return ``df``."""
    for column_name in df.columns:
        converted = df[column_name].astype('int64')
        df[column_name] = converted
    return df
def get_most_incidents_route(df):
    """Return the per-column counts for the route with the most ``incident_type`` rows.

    The returned Series is named after that route.
    """
    per_route = df.groupby('route').count()
    ranked = per_route.sort_values(by='incident_type', ascending=False)
    return ranked.iloc[0]
get_most_incidents_route(df_tfl)
import pandas as pd
data = pd.DataFrame({
'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
'title': ['Pulp Fiction (1994)', 'Three Colors: Red (Trois couleurs: Rouge) (1994)', 'Three Colors: Blue (Trois couleurs: Bleu) (1993)', 'Underground (1995)', 'Singin\' in the Rain (1952)', 'Dirty Dancing (1987)', 'Delicatessen (1991)', 'Ran (1985)', 'Seventh Seal, The (Sjunde inseglet, Det) (1957)', 'Bridge on the River Kwai, The (1957)'],
'rating': [5.0, 3.5, 5.0, 5.0, 3.5, 4.0, 3.5, 3.5, 5.0, 4.0]
})
def film_rating(df):
return df.groupby('title')['rating'].mean().reset_index().
import re

# Build a mapping from the second field of each log line to its fourth field,
# splitting on commas and colons and stripping the surrounding double quotes.
purchases = {}
# The context manager closes the file; the original never closed its handle.
with open('purchase_log.txt', encoding='utf-8') as f:
    for line in f:
        line = re.split(r",|:", line.strip())
        keys = line[1].strip('"')
        values = line[3].strip('"')
        purchases[keys] = values
df.groupby('userId')['timestamp'].agg(['max', 'min'])
df['av_ltv'] = df.groupby('userId')['timestamp'].agg(['max', 'min'])['max'] - df.groupby('userId')['timestamp'].agg(['max', 'min'])['min']
def to_binary(n):
    """Return the binary digits of integer ``n`` without the ``0b`` prefix.

    Negative numbers keep their sign (``-5`` -> ``'-101'``); slicing
    ``bin(n)[2:]`` produced ``'b101'`` for them.
    """
    return format(n, 'b')
data.name.duplicated()
data[data.name.duplicated()]
data[data.name.duplicated()].sort_values(by='name')
def year_leaders(dfp):
    """Return, per ``chart_debut`` value, the row with the most ``num_of_hits``.

    The input is left untouched; the result is indexed by ``chart_debut`` in
    descending order.
    """
    ranked = dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    leaders = ranked.drop_duplicates(subset='chart_debut', keep='first')
    return leaders.set_index('chart_debut')
def y_axis(question6):
    """Return the answer counts for column ``question6`` of the global ``df``."""
    tally = df[question6].value_counts()
    return tally.values
def x_axis(question6):
    """Return the distinct answers for column ``question6`` of the global ``df``.

    The answers come back in ``value_counts`` order.
    """
    tally = df[question6].value_counts()
    return tally.index
def plot(question6):
    """Show a horizontal bar chart of the answers to survey column ``question6``."""
    counts = y_axis(question6)
    labels = x_axis(question6)
    fig = px.bar(x=counts, y=labels, orientation='h')
    fig.update_layout(title_text=question6)
    fig.show()
plot('What is the most preferred working environment for you.')
import matplotlib.pyplot as plt
def draw_histogram(a, b, legend):
    """Overlay 12-bin histograms of ``a`` and ``b`` and show the figure.

    ``legend`` holds the labels passed to ``plt.legend``.
    """
    for sample in (a, b):
        plt.hist(sample, bins=12)
    plt.title("Normal Distribution")
    plt.xlabel("x")
    plt.ylabel("Frequency")
    plt.legend(legend)
    plt.show()
A = [1,2,3,4,5,6,7,8,9,10,11,12]
B = [1,2,3,4,5,3,3,2,2,2,2,1]
legend = ["A", "B"]
draw_histogram(A,B,legend)
def multiple_of_index(arr):
    """Return the elements of ``arr`` that are multiples of their own index.

    Index 0 is skipped (nothing is divisible by zero), which matches the
    other ``multiple_of_index`` implementation in this file.
    """
    return [value for index, value in enumerate(arr) if index and value % index == 0]
dfp = df.copy()
dfp.sort_values(by='performer', inplace=True)
def group_by_performer(data):
    """Collapse ``data`` to one row per performer, in place, and return it.

    Adds ``hits`` (comma-joined unique songs) and replaces ``time_on_chart``
    and ``consecutive_weeks`` with per-performer sums. The old index is kept
    as an ``index`` column.

    The original body ignored ``data`` and operated on the global ``dfp``.
    Working on the argument is identical for ``group_by_performer(dfp)`` and
    makes every other call correct.
    """
    data['hits'] = data.groupby('performer')['song'].transform(lambda x: ', '.join(x.unique()))
    data['time_on_chart'] = data.groupby('performer')['time_on_chart'].transform(lambda x: x.sum())
    data['consecutive_weeks'] = data.groupby('performer')['consecutive_weeks'].transform(lambda x: x.sum())
    data.drop_duplicates(subset='performer', inplace=True)
    data.reset_index(inplace=True)
    return data
group_by_performer(dfp)
def year_leaders(dfp, year):
    """Return the row with the most ``num_of_hits`` among those debuting in ``year``.

    ``year`` must match the dtype of the ``chart_debut`` column. ``dfp`` is
    not modified.
    """
    ranked = dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    leaders = ranked.drop_duplicates(subset='chart_debut', keep='first')
    by_year = leaders.set_index('chart_debut')
    return by_year.loc[year]
year_leaders(dfp, '1956')
def geo_class(row):
    """Return the region whose city name occurs as a substring of ``row``.

    Matching is case-sensitive. Returns ``'undefined'`` when no city matches.
    """
    geo_data = {'center': ['Moscow', 'Tula', 'Yaroslavl'],
                'Northwest': ['petersburg', 'pskov', 'murmansk'],
                'Far East': ['vladivostok', 'sakhalin', 'khabarovsk']}
    for region, cities in geo_data.items():
        if any(city in row for city in cities):
            return region
    return 'undefined'
import pandas as pd
def get_line_id(row):
return row.name
df['line_id'] = df.apply(lambda row: get_line_id(row), axis=1)
def create_dataframe(sl, sw):
    """Build a two-column DataFrame (``sl``, ``sw``) from the given sequences."""
    columns = {"sl": sl, "sw": sw}
    return pd.DataFrame(columns)
create_dataframe(sl, sw)
ax.set_xticks(np.arange(0, 80, 10))
plt.grid(True)
def get_needed_posts(query):
    """Search habr.com posts for ``query`` and return them as a DataFrame.

    Returns:
        DataFrame with the columns ``date``, ``title`` and ``link`` (absolute
        URL), one row per article that could be parsed. It is empty when
        nothing was found.
    """
    # Passing the query via params lets requests URL-encode it.
    response = requests.get(
        'https://habr.com/ru/search/',
        params={'target_type': 'posts', 'q': query},
    )
    soup = BeautifulSoup(response.text, 'lxml')
    container = soup.find('div', class_='content-list')
    articles = container.find_all('article', class_='post post_preview') if container else []

    rows = []
    for article in articles:
        try:
            header = article.find('h2', class_='post__title')
            title = header.text
            date = article.find('span', class_='post__time').text.strip()
            link = header.find('a').get('href')
        except AttributeError:
            # A missing tag makes find() return None. Skip this article
            # rather than reuse fields left over from the previous one.
            continue
        if link is None:
            continue
        rows.append({'date': date, 'title': title, 'link': 'https://habr.com' + link})
    # Build the frame once; concatenating inside the loop is quadratic.
    return pd.DataFrame(rows, columns=['date', 'title', 'link'])
# import data
import pandas as pd
data = pd.read_csv("ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
data.head()
# create a new dataframe that counts the number of ratings per user
user_data = data.groupby('user_id').count()
user_data.head()
# create a new column that is true if a user has rated 100 or more
user_data['hasrated'] = user_data['rating'] >= 100
user_data.head()
# create a new dataframe from the original data, where user_id is in the list
# user_id's that have more than 100 ratings
rating_data = data[data['user_id'].isin(user_data[user_data['hasrated'] == True].index)]
rating_data.head()
import pandas as pd
df = pd.read_csv('./movies.csv')
df['av_ltv'] = df.groupby('userId')['timestamp'].transform(lambda x: x.max() - x.min())
df
import pandas as pd
rzd = pd.DataFrame(
{
'client_id': [111, 112, 113, 114, 115],
'rzd_revenue': [1093, 2810, 10283, 5774, 981]
}
)
auto = pd.DataFrame(
{
'client_id': [113, 114, 115, 116, 117],
'auto_revenue': [57483, 83, 912, 4834, 98]
}
)
def merge_df(rzd, auto):
    """Outer-join the two revenue tables on ``client_id``."""
    return rzd.merge(right=auto, on='client_id', how='outer')
def rate_group(int_rate):
    """Bucket an interest-rate string such as ``'13.5%'``.

    Returns:
        ``'>15'`` for rates above 15, ``'10-15'`` for rates in (10, 15],
        and ``'<10'`` otherwise.
    """
    # Removing a literal "%" needs no regex, and no import of `re` is visible
    # before this point in the file.
    rate = float(int_rate.replace('%', ''))
    if rate > 15.0:
        return '>15'
    if rate > 10.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
def check_if_it_is_time_for_upgrade(self):
    """Promote when seniority plus twice the awards is a multiple of 7.

    Returns:
        Whatever ``self.publish_grade()`` returns.
    """
    # Parenthesise the whole sum: `%` binds tighter than `+`, so the original
    # only reduced the awards term and compared `seniority + ...` to 0.
    if (self.seniority + self.intlawards * 2) % 7 == 0:
        self.grade_up()
    return self.publish_grade()
def value_filler(row):
    """Clamp one respiratory-rate value to the range 1..4 and round it.

    Args:
        row: A single cell value. ``Series.apply`` passes values, not rows.

    Returns:
        1 for values below 1, 4 for values above 4, otherwise the rounded
        value. NaN is returned unchanged.
    """
    # NaN is the only value that differs from itself.
    if row != row:
        return row
    if row < 1:
        return 1
    if row > 4:
        return 4
    return round(row)


# Fill gaps with the median of rows sharing the same pulse, then clamp each
# value. The original did the fill inside the per-value function and compared
# the whole filled Series to a scalar, which raises.
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
    df2.groupby('pulse')['respiratory_rate'].transform('median')
).apply(value_filler)
import pandas
import numpy as np
df = pandas.DataFrame({'date': ['2013-05-01', '2013-06-01', '2013-07-01', '2013-08-01'],
'av_temp': [19.059, 17.613, 17.0, 19.759],
'deviations': [1.022, 0.473, 0.453, 0.717],
'country': ['Zimbabwe', 'Zimbabwe', 'Zimbabwe', 'Zimbabwe'],
'year': [2013, 2013, 2013, 2013],
'decade': ['2010-2020', '2010-2020', '2010-2020', '2010-2020']})
df
def get_year(data):
    """Replace ``chart_debut`` (``YYYY-MM-DD``) with its year, in place.

    Returns the same ``data`` object.
    """
    parsed = pd.to_datetime(data['chart_debut'], format='%Y-%m-%d')
    data['chart_debut'] = parsed.dt.year
    return data
dfp = get_year(dfp)
def show_yandex(df):
    """Return the rows whose ``company`` contains the substring ``'yandex'``.

    Rows with a missing company are excluded.
    """
    # na=False keeps the mask strictly boolean; a NaN in the mask makes
    # boolean indexing raise.
    return df[df['company'].str.contains('yandex', na=False)]
show_yandex(df)
df = dataview[dataview['rating'] == 5.0]
df['decade'].value_counts()
def fill_rectal_temp(df, col_name = 'rectal_temp'):
    """Return a copy of ``df`` with ``col_name`` augmented by correlated columns.

    Missing target values start at 0. Every remaining correlated column then
    adds ``correlation * column`` to the target.

    Raises:
        KeyError: if 'surgery', 'outcome', 'age' or 'pain' is not among the
            correlated columns.
    """
    # Correlation of every column with the target; [1:] drops the first entry.
    # NOTE(review): presumably the first column is an id — confirm.
    corr = df.corr()[col_name][1:]
    corr_dict = corr.to_dict()
    # These columns are excluded from the adjustment.
    corr_dict.pop('surgery')
    corr_dict.pop('outcome')
    corr_dict.pop('age')
    corr_dict.pop('pain')
    mean_col1 = df[col_name].mean()
    df_copy = df.copy()
    df_copy[col_name] = df_copy[col_name].fillna(0)
    for key, value in corr_dict.items():
        # NOTE(review): col_mean is computed but never used.
        col_mean = df[key].mean()
        # Gaps in a predictor are filled with target_mean * correlation.
        df_copy[key] = df_copy[key].fillna(mean_col1*value)
        # The target accumulates across all predictors, including rows that
        # already had a value. TODO confirm this is intended.
        df_copy[col_name] = df_copy[col_name] + value*df_copy[key]
    return df_copy
fill_rectal_temp(df2)
def convert_float(dataframe):
    """Cast every column that can be converted to ``float64``, in place.

    Columns that cannot be converted are left untouched.

    Returns:
        The same ``dataframe``, so calls can be chained.
    """
    for c in dataframe.columns:
        try:
            dataframe[c] = dataframe[c].astype('float64')
        except (ValueError, TypeError):
            # Non-numeric column: keep as is. Anything else (KeyboardInterrupt,
            # real bugs) must not be swallowed as the bare except did.
            continue
    return dataframe
convert_float(dataframe)
def how_much_water(water, load, clothes):
    """Return the water needed: 10% more per item over the standard ``load``."""
    extra_items = clothes - load
    return water * (1.1 ** extra_items)
how_much_water(5, 10, 14)
import matplotlib.pyplot as plt
from pandas.plotting import table
fig, ax = plt.subplots(figsize=(10, 2)) # set size frame
ax.xaxis.set_visible(False) # hide the x axis
ax.yaxis.set_visible(False) # hide the y axis
ax.set_frame_on(False) # no visible frame, uncomment if size is ok
tabla = table(ax, df, loc='upper right', colWidths=[0.17]*len(df.columns)) # where df is your data frame
tabla.auto_set_font_size(False) # Activate set fontsize manually
tabla.set_fontsize(12) # if ++fontsize is necessary ++colWidths
tabla.scale(1.2, 1.2) # change size table
plt.savefig('table.png', transparent=True)
def search_data(data, search_word):
    """Return ``data`` without rows whose class columns contain ``search_word``.

    Both ``'Class 1'`` and ``'Class 2'`` are checked for the substring.
    Missing classes never match, and any index is supported.
    """
    # Vectorised substring test. The positional loop needed a 0..n-1 index
    # and raised TypeError on NaN cells.
    in_class_1 = data['Class 1'].str.contains(search_word, regex=False, na=False)
    in_class_2 = data['Class 2'].str.contains(search_word, regex=False, na=False)
    return data[~(in_class_1 | in_class_2)]
search_data(grass, 'Rock')
from scipy import stats
import pandas as pd
def check_for_significance(a, b):
    """Run an independent two-sample t-test on ``a`` and ``b``.

    Returns a one-row DataFrame with the columns ``t`` and ``p-value``.
    """
    statistic, p_value = stats.ttest_ind(a, b)
    summary = {"t": statistic, "p-value": p_value}
    return pd.DataFrame(summary, index=[0])
def convert_float(df, cols):
    """Cast the listed columns to ``float64`` in place and return ``df``.

    A column whose values cannot be parsed is replaced with NaN entirely.
    """
    for name in cols:
        try:
            df[name] = df[name].astype('float64')
        except ValueError:
            df[name] = np.nan
    return df
def how_much_water(water, load, clothes):
    """Return the water needed for a wash.

    Gives ``water`` up to the standard ``load`` and 10% more, compounding,
    for every item beyond it.
    """
    return water if clothes <= load else water * 1.1 ** (clothes - load)
#leave the users who rated more than 100 films
ratings_df = ratings_df.groupby("userId").filter(lambda x: x["rating"].count() > 100)
def replace(stnums, students, replacing_num):
    """Set field 1 of every student record containing one of ``stnums``.

    Field 1 becomes ``replacing_num``. Records are modified in place and
    ``students`` is returned.
    """
    # Records are independent of each other, so iterating record-first
    # performs the same checks in the same per-record order.
    for student in students:
        for st_num in stnums:
            if st_num in student:
                student[1] = replacing_num
    return students
data[data.name.duplicated()]
def type_check(v, t):
    """Tell whether ``v`` is exactly of the built-in type named ``t``.

    A string made only of ASCII digits is treated as the integer it spells.

    Raises:
        NameError: if ``t`` does not name a built-in type.
    """
    import builtins

    # `v in "0123456789"` was a substring test: "13" stayed a string, ""
    # crashed int(), and non-string values raised TypeError.
    if isinstance(v, str) and v.isascii() and v.isdigit():
        v = int(v)
    # Look the name up instead of eval(): same result for type names, and no
    # arbitrary code execution.
    expected = getattr(builtins, t, None)
    if not isinstance(expected, type):
        raise NameError(f"name {t!r} is not a built-in type")
    return type(v) is expected
def max_key_value(dct):
    """Return the key whose nested ``'Value'`` entry is the largest.

    Raises:
        ValueError: if ``dct`` is empty.
    """
    # The unreachable `pass` after the return has been dropped.
    return max(dct, key=lambda name: dct[name]['Value'])
def mann_whitney(a, b):
    """Run a two-sided Mann-Whitney U test on samples ``a`` and ``b``.

    Returns:
        SciPy's result object, which unpacks as ``(statistic, pvalue)``.
    """
    # The original was an empty exercise stub with no body, which is a
    # syntax error.
    from scipy.stats import mannwhitneyu

    return mannwhitneyu(a, b, alternative='two-sided')
df['question6'].value_counts().iplot(kind='bar', yTitle='Count', linecolor='black', title = 'Какая рабочая среда для вас наиболее предпочтительна?')
import numpy as np
def array_creation(n):
    """Return the array ``[n-1, n-2, ..., 0]``."""
    start, stop, step = n - 1, -1, -1
    return np.arange(start, stop, step)
df_ru = pd.read_csv('https://raw.githubusercontent.com/OSU-geohackweek2020/tutorials/master/00_data/temperature/df_ru.csv')
df_ru.head()
hotels_rev = df1[['date', 'hotel', 'revenue', 'av_revenue', 'difference', 'in_percent']].sort_values(by=['hotel', 'date'])
hotels_rev.boxplot(column=['av_revenue'], by='hotel')
def transpose(matrix):
    """Return the transpose of a list-of-rows matrix as a list of lists."""
    return list(map(list, zip(*matrix)))
transpose(matrix)
# Convert the series to an array
data1 = data1.values
# Print the data type for data1
print(type(data1))
import pandas as pd
# Three sample rows of the bus-incident data set, column by column.
data = {'year':[2015,2015,2015],
        'date':['01.01.2015','01.01.2015','01.01.2015'],
        'route':[1,4,5],
        'operator':['London General','Metroline','East London'],
        'group_name':['Go-Ahead','Metroline','Stagecoach'],
        'bus_garage':['Southwark','Islington','Havering'],
        'bus_park':['Garage Not Available','Garage Not Available','Garage Not Available'],
        'injury_result':['Injuries treated on scene','Injuries treated on scene','Taken to Hospital – Reported Serious Injury or...'],
        'incident_type':['Onboard Injuries','Onboard Injuries','Onboard Injuries'],
        'victim_category':['Passenger','Passenger','Passenger'],
        'victim_sex':['Male','Male','Male'],
        'victim_age':['Child','Unknown','Elderly']
        }  # the literal was never closed
# Task: write a function that takes two numbers as input and returns True if the first is bigger than the second, and False otherwise.
def total_ingredients(cook_book, dish='salad', portions=5):
    """Return the quantity of each ingredient needed for ``portions`` of ``dish``.

    Args:
        cook_book: Mapping of dish name to a list of ingredient dicts with
            the keys ``ingridient_name``, ``quantity`` and ``measure``.
        dish: Dish to prepare (defaults to the previously hard-coded value).
        portions: Number of portions (defaults to the previous value).

    Returns:
        Dict mapping ingredient name to total quantity.
    """
    # The original indexed cook_book['quantity'] (a KeyError, since quantities
    # live inside each ingredient) and never returned the result.
    return {
        item['ingridient_name']: item['quantity'] * portions
        for item in cook_book[dish]
    }
cook_book = {'salad': [{'ingridient_name': 'cheese', 'quantity': 50, 'measure': 'g'},
{'ingridient_name': 'tomatoes', 'quantity': 2, 'measure': 'pcs'},
{'ingridient_name': 'cucumbers', 'quantity': 100, 'measure': 'g'}]}
total_ingredients(cook_book)
def plot_city_reviews(df):
fig, ax = plt.subplots()
df.sort_values('perc_of_5star', ascending=False)[:10].plot.barh(x='place', y='perc_of_5star', figsize=(10,6), ax=ax)
_ = ax.set(ylabel='Decade', xlabel='% of 5-star reviews')
df.sort_values(['time_on_chart', 'max'], ascending=[False, False]).head(20)
# Python's % takes the sign of the divisor (-7 % 10 == 3), so the last digits
# are taken from the absolute value to classify negative numbers correctly.
def ends77(num):
    """Tell whether the decimal form of ``num`` ends in 77."""
    return abs(num) % 100 == 77


def ends7(num):
    """Tell whether the decimal form of ``num`` ends in 7."""
    return abs(num) % 10 == 7


def ends00(num):
    """Tell whether ``num`` is a multiple of 100."""
    return abs(num) % 100 == 0


def ends0(num):
    """Tell whether ``num`` is a multiple of 10."""
    return abs(num) % 10 == 0
def row_from_columns(df):
return df.melt(id_vars=['spi_rank', 'country', 'spi_score'])
row_from_columns(df)
def find_non_numbers(df, col):
    """Return the rows of ``df`` whose ``col`` value is a purely alphabetic string.

    Non-string cells (numbers, NaN, None) never match instead of raising.
    """
    is_alpha = df[col].apply(lambda x: isinstance(x, str) and x.isalpha())
    # astype(bool) keeps the mask boolean even when the column is empty.
    return df[is_alpha.astype(bool)]
def RemoveRockGrass(dataset):
    """Return ``dataset`` without rows whose ``Class 1`` or ``Class 2`` is ``'Rock'``."""
    # A single boolean mask replaces the per-row drop, which copied the frame
    # on every match.
    is_rock = (dataset['Class 1'] == 'Rock') | (dataset['Class 2'] == 'Rock')
    return dataset[~is_rock]
X_train = [['a', 'b'], ['c', 'd']]
y_train = ['a', 'b']
X_test = [['a', 'b'], ['c', 'd']]
y_test = ['a', 'b']
def convert_to_float(text_data):
    """Convert numeric text to floats.

    Objects with ``astype`` (arrays, Series) are converted with it, as
    before. Plain sequences such as lists are turned into a float array.

    Raises:
        ValueError: if an element cannot be parsed as a number.
    """
    if hasattr(text_data, 'astype'):
        return text_data.astype(float)
    # The file calls this with plain lists, which have no astype.
    import numpy as np

    return np.asarray(text_data, dtype=float)
X_train = convert_to_float(X_train)
y_train = convert_to_float(y_train)
def is_acceptable_password(password: str) -> bool:
    """Accept passwords longer than 6 characters that contain a digit."""
    if len(password) <= 6:
        return False
    return any(map(str.isdigit, password))
is_acceptable_password("muchlonger5")
import re

# Map field 1 to field 3 of every log line (quotes are kept here).
purchases = {}
# Import first, and let the context manager close the file on any exit path.
with open('purchase_log.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        line = re.split(r",|:", line.strip())
        keys = line[1]
        values = line[3]
        purchases[keys] = values
#Python 3.7.3
#https://stackoverflow.com/questions/57774616/find-duplicates-in-list-of-tuples-in-python
def find_duplicates(lst):
    """Count how often each record's first element occurs in ``lst``.

    Returns a dict mapping first element to count, in order of appearance.
    """
    first_fields = [record[0] for record in lst]
    counts = {}
    for number in first_fields:
        counts[number] = counts.get(number, 0) + 1
    return counts
def change_duplicates(lst, replacing_num):
    """Overwrite the number of every repeated record with ``replacing_num``.

    The first record carrying a given number keeps it. Later records with
    the same first element get ``replacing_num``. Records are modified in
    place.

    Returns:
        The same ``lst``.
    """
    # The original compared the number to replacing_num and then assigned
    # replacing_num to it, which changes nothing.
    seen = set()
    for record in lst:
        if record[0] in seen:
            record[0] = replacing_num
        else:
            seen.add(record[0])
    return lst
students = [["1", "John", "Biology", "A"], ["2", "Mary", "Math", "C"], ["3", "Alex", "Computer Science", "B"], ["3", "Alex", "Computer Science", "B"]]
print(students)
df_new.groupby('operator').date.mean()
gdpdiff[['Country or region', 'GDP per capita']].plot(kind='bar')
plt.xlabel('Country or region')
plt.ylabel('GDP per capita')
plt.title('Change in GDP per capita')
plt.show()
hotels_rev = df1[['date', 'hotel', 'revenue', 'av_revenue', 'difference', 'in_percent']].sort_values(by=['hotel', 'date'])
hotels_rev
%matplotlib inline
hotels_rev['hotel'].value_counts()
hotels_rev.plot('hotel', 'av_revenue', kind = 'bar')
def change_conv_to_one(df):
    """Cap ``Conversion`` at 1 in place and return ``df``."""
    above_one = df['Conversion'] > 1
    df.loc[above_one, 'Conversion'] = 1
    return df
import pandas as pd
df = pd.read_csv('../data/bus_data.csv')
df = df[df['incident_type']=='Onboard Injuries'][['route', 'incident_type']]
df.groupby('route')['incident_type'].count().sort_values(ascending=False)
misclassified = []
for index, label_predicted in enumerate(predicted):
if label_predicted != y_test[index]:
misclassified.append({'Message': df.iloc[index]['Message'], 'actual': df.iloc[index]['Category'], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
import pandas as pd
df = pd.read_csv('SDG_data.csv')
def mean_row(data):
    """Return a one-row DataFrame with the column means of the SPI indicators."""
    indicator_columns = [
        "basic_human_needs", "foundations_of_wellbeing", "opportunity",
        "basic_nutri_med_care", "water_sanitation", "shelter", "personal_safety",
        "access_to_knowledge", "access_to_communications", "health_wellness",
        "environmental_quality", "personal_rights", "personal_freedom",
        "inclusiveness", "access_to_advanced_education",
    ]
    column_means = data[indicator_columns].mean(axis=0)
    return pd.DataFrame(column_means).T
def to_binary(n):
return bin(n)
def round_to_nearest_integer(x):
return round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].fillna(
df2.groupby('pulse')['respiratory_rate'].transform(round_to_nearest_integer))
import pandas as pd
import matplotlib.pyplot as plt
data = {'water': [1, 2, 3, 4, 2, 4, 2, 4, 5, 2, 3, 4, 2, 1, 3, 4, 3, 2, 5, 1],
'nutri': [1, 2, 4, 6, 5, 6, 7, 5, 4, 5, 6, 7, 4, 3, 5, 5, 6, 5, 4, 3],
'mineral': [2, 1, 1, 3, 2, 4, 2, 4, 5, 4, 3, 2, 3, 2, 3, 1, 3, 4, 5, 1]}
df = pd.DataFrame(data)
df.plot(kind='bar', stacked=True)
def round_nearest_int(x):
    """Round ``x`` to the nearest integer, passing NaN through unchanged.

    ``round(float('nan'))`` raises ValueError, which would abort
    ``Series.apply`` on any column with missing values.
    """
    if isinstance(x, float) and x != x:  # NaN is the only value unequal to itself
        return x
    return round(x)
df2['respiratory_rate'] = df2['respiratory_rate'].apply(round_nearest_int)
def replace(stnums, students):
    """Remove, in place, every student whose number (field 0) is in ``stnums``.

    This mirrors the other two-argument ``replace`` in this file. The stub
    had no body at all, which is a syntax error.

    Returns:
        The same ``students`` list.
    """
    # Slice assignment keeps the caller's list object while avoiding removal
    # during iteration.
    students[:] = [student for student in students if student[0] not in stnums]
    return students
hotel = pd.read_csv('hotel_bookings.csv', sep=';')
hotel.duplicated(['name'])
hotel[hotel.duplicated(['name'], keep = False)].sort_values(by = 'name')
def replace_nums(stnums, students, replacing_num):
    """Replace every field found in ``stnums`` with ``replacing_num``, in place.

    Returns the same ``students`` list.
    """
    for student in students:
        for position, field in enumerate(student):
            if field in stnums:
                student[position] = replacing_num
    return students
replace_nums(stnums, students, replacing_num)
# Challenge 1
import pandas as pd
def duplicates(df):
    """Return repeat occurrences of ``name``, sorted by name with a fresh index.

    Only the second and later rows for a name are returned.
    """
    repeated = df[df.name.duplicated()]
    ordered = repeated.sort_values(by='name', ascending=True)
    return ordered.reset_index(drop=True)
df = pd.DataFrame({'name': ['Ksenia Rodionova', 'Artur Petrov', 'Ivan Sidorov', 'Ksenia Rodionova'], 'date': ['2021-07-01', '2021-07-02', '2021-07-03', '2021-07-04']})
duplicates(df)
fig = go.Figure()
fig.add_trace(go.Bar(
x=data['Country'],
y=data['Total Cases'],
name='Заболевших',
marker_color='indianred'))
fig.add_trace(go.Bar(
x=data['Country'],
y=data['Total Deaths'],
name='Умерших',
marker_color='lightsalmon'))
fig.update_layout(barmode='group', xaxis_tickangle=-45, yaxis_type="log")
fig.show()
hapscore.plot(kind='line')
def convert_to_str(dataframe):
    """Return ``'<spi_rank> <country>'`` for a single row-like mapping."""
    parts = [str(dataframe['spi_rank']), dataframe['country']]
    return ' '.join(parts)
convert_to_str(df.iloc[0])
def bar_graph(df):
    """Plot a bar chart of a per-hotel percentage change in revenue.

    NOTE(review): several steps look inconsistent with each other (see the
    inline notes). Verify against real data before relying on the chart.
    """
    data = df
    data1 = df[['date', 'revenue']]
    # Mean of every column per date.
    data2 = data.groupby(['date']).mean()
    # Bring the group key back as a column so it can be merged on.
    data2['date'] = data2.index
    # Overlapping 'revenue' columns get the suffixes _x (daily mean) and _y (row value).
    data3 = data2.merge(data1, on = 'date')
    # NOTE(review): data1 carries no 'hotel' column and the groupby mean only
    # keeps it if numeric — confirm 'hotel' exists here. diff() also replaces
    # the revenue columns with row-to-row differences.
    data3 = data3.groupby('hotel', as_index=False).diff()
    data3['in_percent'] = data3.apply(lambda row: round((row['revenue_x'] / row['revenue_y'] - 1) * 100, 2), axis = 1)
    # No effect outside a notebook: the result is discarded.
    data3.head()
    plt.figure(figsize=(12, 8))
    plt.bar(data3['hotel'], data3['in_percent'])
    plt.xlabel('hotel')
    plt.ylabel('change in percent')
    plt.title('change in %')
    plt.show()
import pandas as pd
import matplotlib.pyplot as plt
def plot_top_performers(dataframe, column, number_of_hits):
    """Plot the ``number_of_hits`` rows with the largest ``column`` values.

    Draws a horizontal bar chart keyed by ``performer`` and returns the
    matplotlib axes.
    """
    top = dataframe.nlargest(number_of_hits, column)
    # Plot the ranking column and state the real count; both were hard-coded
    # ('num_of_hits', 'Top 20') regardless of the arguments.
    return top.plot.barh(x='performer', y=column, title=f'Top {number_of_hits} Performers')
data = pd.DataFrame({'performer' : ['Glee Cast', 'Taylor Swift', 'Drake', 'YoungBoy Never Broke Again', 'Aretha Franklin', 'The Beatles'],
'num_of_hits' : [191, 166, 125, 75, 66, 66]})
plot_top_performers(data, 'num_of_hits', 6)
def to_csv_string(array):
    """Render a 2-D sequence as CSV text, one row per line, no quoting."""
    lines = []
    for row in array:
        lines.append(','.join(str(cell) for cell in row))
    return '\n'.join(lines)
def yearExtractor(date):
    """Return the leading four characters of ``date`` as an int."""
    return int(date[:4])
df['year'] = df['Date'].apply(yearExtractor)
def bar_plots(df):
    """Draw one value-count bar chart per column of ``df``.

    Each column gets its own figure, titled with the column name.
    """
    import matplotlib.pyplot as plt

    for col in df.columns:
        # Without a fresh figure every chart is painted onto the same axes.
        plt.figure()
        df[col].value_counts().plot.bar(title=str(col))
    return
def get_needed_posts(query):
site = pd.DataFrame()
links = []
titles = []
for q in query:
URL = parseurl+'search/'
params = {
'q': q
}
req = requests.get(URL, params=params)
soup = BeautifulSoup(req.text)
articles = soup.find_all('article', class_='tm-articles-list__item')
for article in articles:
try:
title = article.find('h2', class_='tm-article-snippet__title').text
date = article.find('span', class_='tm-article-snippet__datetime-published').text.strip()
link = article.find('h2', class_='tm-article-snippet__title').find('a').get('href')
if link not in links and title not in titles:
titles.append(title)
links.append(link)
row = {'date': date, '
def graph(x, y):
plt.plot(x, y)
plt.show()
graph(hapscore.columns, hapscore.loc[0])
def remove_quotes(s):
    """Strip one pair of matching surrounding quotes (``'`` or ``"``) from ``s``.

    Strings that are empty, consist of a single quote character, or start
    and end with different characters are returned unchanged.
    """
    # len >= 2 guards against "" (IndexError) and a lone quote character.
    if len(s) >= 2 and s[0] == s[-1] and s[0] in '\'"':
        return s[1:-1]
    return s
keys = remove_quotes(line[1])
values = remove_quotes(line[3])
# function that determines the size of the necessary sample for the study
def sample_size(delta, sigsqr, conf):
    """Return the sample size needed to estimate a mean within ``delta``.

    Args:
        delta: Acceptable margin of error.
        sigsqr: Population variance.
        conf: Two-sided confidence level, e.g. ``0.95``.

    Returns:
        ``ceil((z * sqrt(sigsqr) / delta) ** 2)`` as an int.
    """
    # Function-scope imports: this file later defines a function called
    # `math`, which would shadow the module, and `norm` is never imported.
    from math import ceil, sqrt
    from statistics import NormalDist

    alpha = 1 - conf
    z = NormalDist().inv_cdf(1 - alpha / 2)
    return ceil((z * sqrt(sigsqr) / delta) ** 2)
# calling function
sample_size(0.1, 1, 0.95)
def math(a):
    """Return ``a * 50 + 6`` for ints and ``"Error"`` for strings.

    Any other type yields ``None``.
    """
    # NOTE(review): this name shadows the stdlib `math` module for any code
    # that follows it in the file.
    kind = type(a)
    if kind == str:
        return "Error"
    if kind == int:
        return (a * 50) + 6
print(math(1))
print(math("Hello"))
print(math(2))
def filter_hot_years(df, hot_temp = 12, country = 'Russia'):
    """Return the rows of ``country`` whose ``av_temp`` exceeds ``hot_temp``."""
    is_hot = df['av_temp'] > hot_temp
    in_country = df['country'] == country
    return df[is_hot & in_country]
df_hot_years = filter_hot_years(df_ru, hot_temp = 18)
def sort_and_return(df, top_n=393):
    """Return the ``top_n`` rows of ``df`` with the highest ``networth``.

    Args:
        df: DataFrame with a ``networth`` column.
        top_n: Number of rows to keep. Defaults to the previously hard-coded 393.
    """
    ranked = df.sort_values('networth', ascending=False)
    return ranked[0:top_n]
def group_by_title(df):
pass
def transpose(matrix):
    """Swap rows and columns of a list-of-rows matrix."""
    return [list(column) for column in zip(*matrix)]
matrix_t = transpose(matrix)
def highlight_max_min(df):
df.style.highlight_max(axis=1,
props='color:white;\
font-weight:bold;\
background-color:green;')
return df.style.highlight_min(axis=1,
props='color:white;\
font-weight:bold;\
background-color:brown;')
def multiple_of_index(arr):
    """Return the elements of ``arr`` divisible by their own index.

    Index 0 is skipped.
    """
    return [arr[position] for position in range(1, len(arr)) if arr[position] % position == 0]
def create_diag_matrix(n):
    """Return the ``n`` x ``n`` identity matrix as a list of lists.

    The original stopped after allocating the zero matrix and never set the
    diagonal. The result now matches ``create_diag_matrix_2`` and
    ``create_diag_matrix_3``.
    """
    return [[1 if row == col else 0 for col in range(n)] for row in range(n)]
def create_diag_matrix_2(n):
    """Return the ``n`` x ``n`` identity matrix as a list of lists."""
    return [[int(col == row) for col in range(n)] for row in range(n)]
def create_diag_matrix_3(n):
    """Return the ``n`` x ``n`` identity matrix as a list of lists."""
    matrix = []
    for row in range(n):
        line = [0] * n
        line[row] = 1
        matrix.append(line)
    return matrix
def group_avg_ratings(df):
#your code here
df = pd.DataFrame({'id':[0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9], 'title':['Pulp Fiction (1994)','Three Colors: Red (Trois couleurs: Rouge) (1994)','Three Colors: Blue (Trois couleurs: Bleu) (1993)','Underground (1995)','Singin\' in the Rain (1952)','Dirty Dancing (1987)','Delicatessen (1991)','Ran (1985)','Seventh Seal, The (Sjunde inseglet, Det) (1957)','Bridge on the River Kwai, The (1957)','Pulp Fiction (1994)','Three Colors: Red (Trois couleurs: Rouge) (1994)','Three Colors: Blue (Trois couleurs: Bleu) (1993)','Underground (1995)','Singin\' in the Rain (1952)','
def make_df(water, nutri, mineral):
    """Stack the three samples into a long ``treatments`` / ``value`` table.

    Each sample keeps its own 0-based index.
    """
    frames = []
    for label, values in (('water', water), ('nutri', nutri), ('mineral', mineral)):
        positions = list(range(len(values)))
        frames.append(pd.DataFrame({'treatments': label, 'value': values}, index=positions))
    return pd.concat(frames)
make_df(water, nutri, mineral)
def nth_even(n):
    """Return the n-th even number, counting from ``nth_even(1) == 0``."""
    doubled = n * 2
    return doubled - 2
def misclassified_messages(y_test, predicted):
    """Collect the predictions that differ from ``y_test`` into a DataFrame.

    Each row holds the ``message``, the ``actual`` category and the
    ``predicted`` label.
    """
    # NOTE(review): rows are looked up positionally in the global `df`, which
    # presumably must be ordered like y_test — verify against the caller.
    rows = [
        {'message': df.iloc[index]['Message'], 'actual': df.iloc[index]['Category'], 'predicted': label_predicted}
        for index, label_predicted in enumerate(predicted)
        if label_predicted != y_test[index]
    ]
    return pd.DataFrame(rows)
def gdp(df19):
    """Return the countries ranked 1st and 20th by GDP per capita.

    Returns:
        DataFrame with the columns ``Country or region`` and
        ``GDP per capita``, indexed ``'Top1'`` and ``'Top20'``. Rows that do
        not exist (fewer than 20 countries) are omitted.
    """
    # Sort once. The original sorted twice and then tried to build a
    # DataFrame out of two DataFrames, which pandas rejects.
    ranked = (df19[['Country or region', 'GDP per capita']]
              .sort_values(by='GDP per capita', ascending=False)
              .head(20))
    wanted = [('Top1', 0), ('Top20', 19)]
    present = [(label, pos) for label, pos in wanted if pos < len(ranked)]
    dataset = ranked.iloc[[pos for _, pos in present]].copy()
    dataset.index = [label for label, _ in present]
    return dataset
gdp(df19)
def get_songs_from_artists(df):
# do something
return df
def top_decade(av_rating):
    """Count 5.0-rated rows per decade, most frequent decade first.

    Returns:
        Series mapping decade to the number of rows with ``rating == 5.0``.
    """
    # `if <DataFrame>:` always raises ValueError (ambiguous truth value), and
    # the counts were computed but discarded.
    five_star = av_rating[av_rating['rating'] == 5.0]
    return five_star['decade'].value_counts()
top_decade(av_rating)
def toCsvText(array):
    """Join each row with commas and the rows with newlines."""
    rendered_rows = [",".join([str(cell) for cell in row]) for row in array]
    return "\n".join(rendered_rows)
import re
def rate_group(int_rate):
    """Bucket an interest-rate string such as ``'13.5%'``.

    Returns ``'>15'``, ``'10-15'`` for rates in (10, 15], or ``'<10'``.
    """
    rate = float(re.sub('%', '', int_rate))
    if rate > 15.0:
        return '>15'
    if 10.0 < rate <= 15.0:
        return '10-15'
    return '<10'
df['rate_group'] = df['int_rate'].apply(rate_group)
df_year_leaders = year_leaders(df_performers)
def geo_class(row):
    """Return the region whose city list contains ``row`` exactly.

    Returns ``'undefined'`` when no list contains it.
    """
    geo_data = {'center': ['Moscow', 'Tula', 'Yaroslavl'],
                'Northwest': ['petersburg', 'pskov', 'murmansk'],
                'Far East': ['vladivostok', 'sakhalin', 'khabarovsk']}
    for region, cities in geo_data.items():
        if row in cities:
            return region
    return 'undefined'
def add_x_labels(fig, x_values, x_labels):
    """Label the x-axis ticks of ``fig`` at ``x_values`` with ``x_labels``."""
    tick_options = {'tickvals': x_values, 'ticktext': x_labels}
    fig.update_xaxes(**tick_options)
import pandas as pd
import matplotlib.pyplot as plt
data = {'water': [1, 2, 3, 4, 2, 4, 2, 4, 5, 2, 3, 4, 2, 1, 3, 4, 3, 2, 5, 1],
'nutri': [1, 2, 4, 6, 5, 6, 7, 5, 4, 5, 6, 7, 4, 3, 5, 5, 6, 5, 4, 3],
'mineral': [2, 1, 1, 3, 2, 4, 2, 4, 5, 4, 3, 2, 3, 2, 3, 1, 3, 4, 5, 1]}
data = pd.DataFrame(data, index = range(20))
plt.figure(figsize = (12, 4))
plt.bar(data.index, data['water'], color = 'r', label = 'water')
plt.bar(data.index, data['nutri'], color = 'y', label = 'nutri', bottom = data
def highlight_max_and_min(df):
df.style.highlight_max(axis=1,
props='color:white;\
font-weight:bold;\
background-color:green;')
df.style.highlight_min(axis=1,
props='color:white;\
font-weight:bold;\
background-color:brown;')
return df
def year_leaders(dfp):
    """Return a copy of ``dfp`` sorted and indexed by ``chart_debut``.

    Rows are ordered by ``chart_debut`` and ``num_of_hits``, both descending.
    """
    ranked = dfp.sort_values(by=['chart_debut', 'num_of_hits'], ascending=False)
    return ranked.set_index('chart_debut')
year_leaders(dfp)
import sys
import ast
import numpy as np
def parse_array(s):
    """Parse a Python-literal string (nested lists of numbers) into an ndarray."""
    literal = ast.literal_eval(s)
    return np.array(literal)


def read_array():
    """Read one line from stdin and parse it as an array."""
    line = sys.stdin.readline()
    return parse_array(line)


def write_array(arr):
    """Print ``arr`` as a nested-list literal."""
    as_list = arr.tolist()
    print(repr(as_list))
def calculate_conv_x_grad(x, y, kernel, bias):
    """Compute the gradient of a 1-D convolution with respect to its input.

    Expected shapes:
        x - InLen x InChannels
        y - OutLen x OutChannels
        kernel - OutChannels x InChannels x KernelSize
        bias - OutChannels

    Returns InLen x InChannels.

    NOTE(review): there is no implementation yet, so the function returns
    None. The driver code below calls ``write_array(result)``, which will
    fail on ``None.tolist()``. It is also unclear whether ``y`` is the
    forward output or the upstream gradient — confirm with the task text.
    """
x = read_array()
y = read_array()
kernel = read_array()
bias = read_array()
result = calculate_conv_x_grad(x, y, kernel, bias)
write_array(result)
# Python
## Python Basics
### Basic Syntax
Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.
The official Python website is www.python.org. There you will find the latest Python documentation. There are also mailing lists, newsgroups, and a number of other resources available at the site.
### Running Python
The Python interpreter is usually installed as /usr/local/bin/python3.7 on those machines where it is available; putting /usr/local/bin in your Unix shell’s search path makes it possible to start it by typing the command `python3.7` at the shell prompt.
import pandas as pd

water = [25,30,28,24,34]
nutri = [36,29,45,23,30,39,28]
mineral = [55,29,56,20]
df = pd.DataFrame()
df['index'] = list(range(len(water)+len(nutri)+len(mineral)))
# Label rows by which sample they came from. Replacing by VALUE mislabels
# measurements that occur in more than one sample (30 and 28 are in both
# water and nutri, 29 in both nutri and mineral).
df['treatment'] = ['water'] * len(water) + ['nutri'] * len(nutri) + ['mineral'] * len(mineral)
df
def unique_id(df, col1, col2, col3):
return df[col1]+df[col2]+df[col3]
from sklearn.preprocessing import StandardScaler
st_scaler = StandardScaler()
sl = np.array(sl).reshape(-1, 1)
sw = np.array(sw).reshape(-1, 1)
sl = st_scaler.fit_transform(sl)
sw = st_scaler.fit_transform(sw)
print(sl)
print(sw)
def res(arr):
    """Return the sum of ``arr[i] + arr[j]`` over all index pairs ``i < j``."""
    size = len(arr)
    return sum(
        arr[first] + arr[second]
        for first in range(size)
        for second in range(first + 1, size)
    )
res(arr)
def add_money(winnums):
    """Increment every element of ``winnums`` by 1 in place and return it."""
    for position in range(len(winnums)):
        winnums[position] += 1
    return winnums
add_money(winnums)
# factorplot was renamed to catplot and its `size` argument to `height`;
# the old spelling is no longer available in current seaborn.
sns.catplot(x='date', y='rating', col='hotel', data=hotels_rating, col_wrap=3,
            kind='bar', height=4, aspect=.9)
plt.suptitle('hotel rating change',size=16)
plt.subplots_adjust(top=.9)
def sum_matrix(N):
    """Return the sum of the diagonal matrix ``diag(N-1, ..., 1, 0)``.

    Off-diagonal entries are all zero, so only the diagonal is summed. This
    avoids allocating an N x N matrix.
    """
    return np.sum(np.arange(N - 1, -1, -1))
print(sum_matrix(5))
print(sum_matrix(10))
print(sum_matrix(15))
def how_much_water(max_load, load, clothes):
    """Scale ``max_load`` by 10% per item of difference between ``clothes`` and ``load``."""
    overload = clothes - load
    growth = 1.1 ** overload
    return max_load * growth
from scipy.stats import ttest_ind
sync = [85.1, 83.8, 69.9, 82.1, 84.4, 80.4, 78.1, 88.4, 77. , 91.5, 76.7, 86.6, 91.8, 73.3,
83.9, 76.7, 85.8, 89.6, 91.7, 87.2, 79. , 85.3]
asyncr = [89.8, 81.6, 87.4, 81., 66.9, 72.5, 78.4, 68.5, 78.3, 62.6, 73.7, 77.7, 63., 77.5]
ttest_ind(sync, asyncr)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf, cats, test_size=0.3, random_state=42)
def get_shop_list_by_dishes(dishes, person_count):
    """Build a shopping list for ``dishes`` cooked for ``person_count`` people.

    Ingredients come from the global ``cook_book``. Quantities of an
    ingredient used by several dishes are added up.

    Returns a dict mapping ingredient name to an ingredient dict with the
    total quantity.
    """
    shop_list = {}
    for dish in dishes:
        for ingridient in cook_book[dish]:
            # Copy so the cook book itself is never modified.
            item = dict(ingridient)
            item['quantity'] *= person_count
            name = item['ingridient_name']
            if name in shop_list:
                shop_list[name]['quantity'] += item['quantity']
            else:
                shop_list[name] = item
    return shop_list
def print_shop_list(shop_list):
for shop_list_item in shop_list.values():
print('{} {} {}'.format(shop_list_item['ingridient_name'], shop_list_item['quantity'],
shop
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/justmarkham/pandas-videos/master/data/ufo.csv')
df.head()
df.shape
df.info()
df1 = df[df.City != '%']
df1.shape
df1.head()
import numpy as np
def diagonal_matrix(n):
arr = np.arange(1, n+1)
return np.diag(arr)
print(diagonal_matrix(4))
def year(df):
    """Add a ``year`` column with the first four characters of ``Last Updated``.

    ``df`` is modified in place and returned.
    """
    def leading_four(stamp):
        return stamp[:4]

    df["year"] = df["Last Updated"].apply(leading_four)
    return df
import numpy as np
import scipy.stats as st
import seaborn as sns
import matplotlib.pyplot as plt
def difference_of_means(data_1, data_2):
    """Return mean(data_1) minus mean(data_2)."""
    mean_1 = np.mean(data_1)
    mean_2 = np.mean(data_2)
    return mean_1 - mean_2
def permutation_sample(data1, data2):
"""Generate a permutation sample from two data sets."""
# Concatenate the data sets: data
data = np.concatenate((data1, data2))
# Permute the concatenated array: permuted_data
permuted_data = np.random.permutation(data)
# Split the permuted array into two: perm_sample_1, perm_sample_2
perm_sample_1 = permuted_data[:len(data1)]
perm_sample_2 = permuted_data[
# Collect every prediction that disagrees with the true label.
misclassified = []
# `For` (capitalised) is not a keyword and was a syntax error.
for index, label_predicted in enumerate(predicted):
    if label_predicted != y_test[index]:
        misclassified.append({'message': df.iloc[index]['message'], 'actual': df.iloc[index]['label'], 'predicted': label_predicted})
misclassification_df = pd.DataFrame(misclassified)
def get_students(gr, dct=dct):
    """Return the sorted full names of the students in group ``gr``.

    Each value of ``dct`` is a record whose fields 0-2 are the name parts
    and whose field 4 is the group.
    """
    students = [
        v[0] + ' ' + v[1] + ' ' + v[2]
        for v in dct.values()
        if gr == v[4]
    ]
    return sorted(students)
get_students("BST161")
def class_delete(df, Class1, Class2):
    """Drop rows whose ``Class 1`` equals ``Class1`` or ``Class 2`` equals ``Class2``."""
    keep = (df['Class 1'] != Class1) & (df['Class 2'] != Class2)
    return df.loc[keep]
df = class_delete(df, 'Rock', 'Rock')
def replace(stnums, students):
    """Remove, in place, every student whose number (field 0) is in ``stnums``.

    Returns:
        The same ``students`` list.
    """
    # Removing from a list while iterating over it skips the element after
    # each removal, so consecutive matches survived. Rebuild the contents
    # and keep the caller's list object.
    students[:] = [student for student in students if student[0] not in stnums]
    return students
def distance(a, b):
    """Return the Euclidean distance between two points with ``x``/``y`` attributes.

    The original returned the SQUARED distance (no square root).
    """
    # Function-scope import: a function named `math` is defined elsewhere in
    # this file and would shadow the module.
    from math import hypot

    return hypot(a.x - b.x, a.y - b.y)
import pandas as pd
data = pd.read_csv('pokemon.csv')
def delete(data):
    """Return ``data`` without rows whose ``Class 1`` or ``Class 2`` is ``'Rock'``.

    Works with any index. The positional loop required labels 0..n-1.
    """
    is_rock = (data['Class 1'] == 'Rock') | (data['Class 2'] == 'Rock')
    return data[~is_rock]
delete(data)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.DataFrame({'place': [1, 2, 3, 4, 5], 'decade': ['2010-2020','1900-1910','1970-1980','2000-2010','1960-1970'], 'perc_of_5star': [2.300, 1.379, 1.179, 1.176, 1.133]})
plt.barh(df['decade'], df['perc_of_5star'])
plt.xlabel('Percentage of 5 star reviews')
plt.ylabel('Decades')
plt.title('Percentage of 5 star reviews by decade')
plt.show()
import os
def read_files_in_directory(directory):
    """Print and return the stripped lines of every ``.txt`` file in ``directory``.

    Files are processed in name order.

    Returns:
        A flat list of the lines of all ``.txt`` files. It is empty when
        there are none.
    """
    lines = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        # listdir yields bare names. Opening them without the directory only
        # works when the directory happens to be the current one.
        with open(os.path.join(directory, filename), 'r') as f:
            file_lines = [l.strip() for l in f]
        print(file_lines)
        lines.extend(file_lines)
    return lines
read_files_in_directory(r'C:\Users\USER\Desktop\Python\Python Basics')
def del_doc_by_num(doc_num):
    """Remove the first document numbered ``doc_num`` from the global ``documents``.

    Prints a notice when no document has that number.
    """
    match = next((doc for doc in documents if doc['number'] == doc_num), None)
    if match is None:
        print('Document not found')
        return
    documents.remove(match)
del_doc_by_num('10006')
def max_key(dct, key):
    """Return the key of ``dct`` whose nested ``key`` entry is the largest.

    The original returned ``max(key)``, which is the largest character of
    the field name, and ignored ``dct`` entirely.

    Raises:
        ValueError: if ``dct`` is empty.
    """
    return max(dct, key=lambda name: dct[name][key])
max_key('key')
def df_to_string(df):
    """Convert the column labels of ``df`` to strings in place."""
    text_labels = df.columns.astype(str)
    df.columns = text_labels
def student_list(gr):
    """Return the first three fields of every student in group ``gr``.

    Students come from the global ``dct``, where the group is each record's
    last field. The result is ordered by field 1, then field 0.
    """
    members = [dct[key][:3] for key in dct.keys() if gr == dct[key][-1]]
    # Two stable sorts: the later key (field 1) ends up as the primary one.
    by_first = sorted(members, key=lambda person: person[0])
    return sorted(by_first, key=lambda person: person[1])
import matplotlib.pyplot as plt
def happinessgraph():
hapscore.plot(kind='line')
plt.xlabel("Years")
plt.ylabel("Happiness Score")
plt.title("Graph of Happiness Score over the Years")
plt.show()
def mean_values(df):
    """Return a one-row DataFrame holding the mean of each column."""
    averages = df.mean()
    as_column = averages.to_frame()
    return as_column.T
pd.DataFrame({'id': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
'title': ['Pulp Fiction (1994)',
'Three Colors: Red (Trois couleurs: Rouge) (1994)',
'Three Colors: Blue (Trois couleurs: Bleu) (1993)',
'Underground (1995)',
"Singin' in the Rain (1952)",
'Dirty Dancing (1987)',
'Delicatessen (1991)',
'Ran (1985)',
'Seventh Seal, The (Sjunde inseglet, Det) (1957)',
'Bridge on the River Kwai, The (1957)'],
'rating': [5.0, 3.5, 5.0, 5.0, 3.5, 4.0, 3.5, 3.5, 5.0, 4.0]})
df_filtered = df.groupby('userId').filter(lambda x: len(x) >= 100)
def box_plots(A, B):
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
ax.boxplot([A, B], labels=['A', 'B'], patch_artist=True,
boxprops=dict(facecolor='lightblue', color='black', linewidth=1),
medianprops=dict(color='black'))
plt.show()
box_plots(A, B)
df_new["operator"].str.len().mean()
def merge(df1, df2, df3):
    """Stack the three frames vertically, keeping their original indexes."""
    frames = [df1, df2, df3]
    return pd.concat(frames, axis=0)
from sklearn.metrics import f1_score
y_pred = lda.predict(X_test)
f1_score(y_test, y_pred)
def str_to_float(train, test):
    """Convert both sequences to lists of floats.

    Returns the tuple ``(train, test)``.
    """
    return list(map(float, train)), list(map(float, test))
X_train, X_test = str_to_float(X_train, X_test)
y_train, y_test = str_to_float(y_train, y_test)
def plot_bar(df, question, title_text, xlabel, ylabel, orientation='h'):
    """Show a bar chart of the value counts of column ``question``.

    Args:
        df: Source DataFrame.
        question: Column whose answers are counted.
        title_text: Chart title.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        orientation: ``'h'`` (default) puts counts on the x axis, while
            ``'v'`` puts them on the y axis.
    """
    tally = df[question].value_counts()
    label = tally.index
    counts = tally.values
    # xlabel, ylabel and orientation were accepted but silently ignored.
    if orientation == 'h':
        fig = px.bar(x=counts, y=label, orientation='h')
    else:
        fig = px.bar(x=label, y=counts, orientation=orientation)
    fig.update_layout(title_text=title_text, xaxis_title=xlabel, yaxis_title=ylabel)
    fig.show()
df = pd.read_csv('https://raw.githubusercontent.com/GODKarma/Data-Analytics-2020/master/Data/survey_results_public.csv', index_col='Respondent')
plot_bar(df, 'LanguageWorkedWith', 'С какими языками программирования вы работали?', 'Количество', 'Языки')
def calc_av_temp(df):
    """Return the 20 countries with the lowest mean ``av_temp`` after 1980.

    The result is indexed by country and sorted ascending by mean temperature.
    """
    recent = df[df['year'] > 1980]
    # The string alias avoids the FutureWarning pandas emits when np.mean is
    # passed to agg, and drops the dependency on a module-level `np`.
    by_country = recent.groupby('country').agg({'av_temp': 'mean'})
    return by_country.sort_values('av_temp').head(20)