import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
Web application for trash classification using CNN-based image classification: https://github.com/vladalexey/webapp-trash-classification
Code snippet:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from keras.utils import to_categorical
# Load the CIFAR-10 dataset
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
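The snippet stops after loading the data. As a rough sketch of the kind of CNN such an image-classification project trains (the layer sizes below are illustrative assumptions, not the architecture from the linked repository), the rest of the pipeline might look like this:

# Illustrative sketch only: a small CNN for the 32x32 RGB CIFAR-10 images loaded above
num_classes = 10

# Scale pixel values to [0, 1] and one-hot encode the labels
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Two convolution/pooling blocks followed by a small dense classifier
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))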
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
# Load the dataset
data = pd.read_csv('tweets.csv', encoding='latin-1', header=None)
# Assign new column names to the DataFrame
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
data.columns = column_names
# Preprocess the text data
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Remove URLs, usernames, and hashtags
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    # Remove punctuation and convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()

    # Tokenize the text and remove stop words
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into text
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text
# Apply the preprocessing function, then split the data into training and testing sets
data['text'] = data['text'].apply(preprocess_text)
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['target'], test_size=0.2, random_state=42)
# Vectorize the text data
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Train the model
clf = MultinomialNB().fit(X_train_tfidf, y_train)
# Test the model
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)
# Print the classification report
print(classification_report(y_test, y_pred))
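To classify new text with the trained model, the same preprocessing and vectorization steps must be applied first. A minimal usage sketch (the example tweet is made up; for Sentiment140-style data the target is 0 for negative and 4 for positive):

# Classify a single new tweet with the fitted vectorizer, transformer, and classifier
new_tweet = "I absolutely love this, what a great day!"  # made-up example
cleaned = preprocess_text(new_tweet)
new_counts = count_vect.transform([cleaned])
new_tfidf = tfidf_transformer.transform(new_counts)
print(clf.predict(new_tfidf))  # predicted target label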
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
# Load the Netflix dataset
netflix_data = pd.read_csv('netflix_titles.csv', encoding='iso-8859-1')
# Create a new column for sentiment scores of movie and TV show titles
sia = SentimentIntensityAnalyzer()
netflix_data['sentiment_scores'] = netflix_data['Title'].apply(lambda x: sia.polarity_scores(x))
# Extract the compound sentiment score from the sentiment scores dictionary
netflix_data['sentiment_score'] = netflix_data['sentiment_scores'].apply(lambda x: x['compound'])
# Group the data by language and calculate the average sentiment score for movies and TV shows in each language
language_sentiment = netflix_data.groupby('Language')['sentiment_score'].mean()
# Print the top 10 languages with the highest average sentiment score for movies and TV shows
print(language_sentiment.sort_values(ascending=False).head(10))
Output:
7. Customer Segmentation Using K-Means Clustering
Customer segmentation is one of the most important applications of data science. This GitHub data science project asks you to use the K-means clustering algorithm, a popular unsupervised machine learning algorithm that groups data points into K clusters based on similarity.
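As a quick illustration of that idea, separate from the project code below, K-means assigns each point to the nearest of K centroids (the toy points here are made up):

# Toy illustration: K-means groups 2-D points into K=2 clusters by distance
import numpy as np
from sklearn.cluster import KMeans

points = np.array([[1, 2], [1, 4], [2, 3],         # one tight group
                   [10, 11], [11, 13], [12, 12]])  # another tight group
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(points)
print(kmeans.labels_)           # cluster membership, e.g. [0 0 0 1 1 1]
print(kmeans.cluster_centers_)  # the centroid of each cluster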
Problem Statement
The goal of this project is to use the K-means clustering algorithm to segment customers visiting a shopping mall based on specific factors such as annual income and spending habits.
Brief Overview of the Project and Dataset
This project asks you to collect the data, perform initial exploration and preprocessing, and then train and test a K-means clustering model to segment the customers. You can use the Mall Customer Segmentation dataset, which contains 5 features (Customer ID, Gender, Age, Annual Income, and Spending Score) and the corresponding values for 200 customers.
Demonstration of the k-means algorithm on customer data: https://github.com/mayursrt/customer-segmentation-using-k-means
Code snippet:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Load the customer data
customer_data = pd.read_csv('customer_data.csv')
customer_data = customer_data.drop('Gender', axis=1)

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_data)
# Find the optimal number of clusters using the elbow method
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()
# Perform K-Means clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans.fit(scaled_data)
# Add the cluster labels to the original DataFrame
customer_data['Cluster'] = kmeans.labels_
# Plot the clusters based on age and income
plt.scatter(customer_data['Age'], customer_data['Annual Income (k$)'], c=customer_data['Cluster'])
plt.title('Customer Segmentation')
plt.xlabel('Age')
plt.ylabel('Income')
plt.show()
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
# Set up data generators for the training and validation sets
train_datagen = ImageDataGenerator(rescale=1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True)
train_generator = train_datagen.flow_from_directory('train_dir', target_size=(128, 128), batch_size=32, class_mode='binary')
val_datagen = ImageDataGenerator(rescale=1./255)
val_generator = val_datagen.flow_from_directory('val_dir', target_size=(128, 128), batch_size=32, class_mode='binary')
# Build a convolutional neural network for medical diagnosis
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model on the training set and evaluate it on the validation set
history = model.fit(train_generator, epochs=10, validation_data=val_generator)
# Plot the training and validation accuracy curves
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
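After training, the model can also be checked on images it has never seen. A short sketch, assuming a held-out test_dir organized the same way as train_dir and val_dir:

# Evaluate the trained model on a held-out test directory (hypothetical 'test_dir')
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory('test_dir', target_size=(128, 128),
                                                  batch_size=32, class_mode='binary', shuffle=False)
test_loss, test_acc = model.evaluate(test_generator)
print(f'Test accuracy: {test_acc:.3f}, test loss: {test_loss:.3f}')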
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras import models, layers
# Set up paths to the audio files and genre labels
AUDIO_PATH = 'audio'
CSV_PATH = 'data.csv'
# Load audio files and extract features using librosa
def extract_features(file_path):
    audio_data, _ = librosa.load(file_path, sr=22050, mono=True, duration=30)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=22050, n_mfcc=20)
    chroma_stft = librosa.feature.chroma_stft(y=audio_data, sr=22050)
    spectral_centroid = librosa.feature.spectral_centroid(y=audio_data, sr=22050)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=22050)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=22050)
    # Average each feature over time and concatenate into a single feature vector
    features = np.concatenate((np.mean(mfccs, axis=1),
                               np.mean(chroma_stft, axis=1),
                               np.mean(spectral_centroid, axis=1),
                               np.mean(spectral_bandwidth, axis=1),
                               np.mean(spectral_rolloff, axis=1)))
    return features
# Load data from the CSV file and extract features
data = pd.read_csv(CSV_PATH)
features = []
labels = []
for index, row in data.iterrows():
    file_path = os.path.join(AUDIO_PATH, row['filename'])
    genre = row['label']
    features.append(extract_features(file_path))
    labels.append(genre)
# Encode the genre labels and scale the features
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
scaler = StandardScaler()
features = scaler.fit_transform(np.array(features, dtype=float))
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)
# Build a neural network for music genre classification
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(train_features.shape[1],)))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(10, activation='softmax'))
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model on the training set and evaluate it on the testing set
history = model.fit(train_features, train_labels, epochs=50, batch_size=128, validation_data=(test_features, test_labels))
# Plot the training and testing accuracy curves
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Testing Accuracy')
plt.title('Training and Testing Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
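To classify a single new track, the same feature extraction, scaling, and label encoding have to be reused. A minimal sketch (the file name is hypothetical):

# Predict the genre of one new audio file using the fitted scaler and label encoder
new_file = os.path.join(AUDIO_PATH, 'example_track.wav')  # hypothetical file
new_features = scaler.transform([extract_features(new_file)])
probabilities = model.predict(new_features)
predicted_genre = encoder.inverse_transform([np.argmax(probabilities)])
print(predicted_genre[0])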
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
# Load data from the CSV file
data = pd.read_csv('credit_data.csv')
# Clean the data by removing missing values
data.dropna(inplace=True)
# Split the data into features and labels
features = data[['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income', 'num_of_accounts', 'derogatory_marks', 'total_debt']]
labels = data['loan_status']
# Scale the features to have zero mean and unit variance
scaler = StandardScaler()
features = scaler.fit_transform(features)
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)
# Build a logistic regression model for credit risk prediction
model = LogisticRegression()
# Train the model on the training set
model.fit(train_features, train_labels)
# Predict labels for the testing set
predictions = model.predict(test_features)
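The accuracy_score and confusion_matrix imports above can then summarize how well the model performs on the test set:

# Evaluate the predictions using the metrics imported earlier
print('Accuracy:', accuracy_score(test_labels, predictions))
print('Confusion matrix:')
print(confusion_matrix(test_labels, predictions))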