Ich habe mehr als 2000 Datenrahmen mit jeweils zwei Spalten. Ich möchte N-Gramme für die Spalten erstellen und daraus einen neuen Datenrahmen mit den N-Grammen erzeugen. Hier ist mein Code. Er funktioniert gut, benötigt aber sehr viel Zeit. Thema: Verwenden von Vektorisierung oder apply anstelle von iterrows auf Pandas-Datenrahmen in Python.
Ich verwende derzeit iterrows, um jede Zeile jedes Datenrahmens in jeder Datei zu durchlaufen. Gibt es einen einfacheren Weg, dies mit Vektorisierung oder apply zu erledigen?
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time
def _spacing_variants(text):
    """Return every string obtained by keeping or dropping each space in *text*.

    A text with ``n`` spaces yields ``2 ** n`` variants, ordered like
    ``itertools.product((' ', ''), repeat=n)`` (all spaces kept first,
    all spaces removed last).
    """
    # Escape literal percent signs first so that the only placeholders left
    # in the template are the ones standing in for the spaces.
    template = text.replace('%', '%%').replace(' ', '%s')
    fillers = itertools.product((' ', ''), repeat=text.count(' '))
    return [template % filler for filler in fillers]


def create_combinations(file, initial_path='./to_process/', final_path='./processed/'):
    """Expand the ``element`` column of one pickled frame into spacing variants.

    The gzip-compressed pickle ``initial_path/file`` is loaded, duplicate
    (``category``, ``element``) pairs are dropped, and a space is inserted at
    camel-case boundaries of ``element``.  Every element is then expanded into
    all variants produced by keeping or removing each of its spaces.  The
    result is written to ``final_path`` both as a gzip pickle (same file name)
    and as a CSV (same stem, ``.csv`` extension).

    Args:
        file: File name of the pickle inside ``initial_path``.
        initial_path: Directory containing the input pickle.
        final_path: Directory receiving the outputs; created if missing.

    Returns:
        None.  When the frame has no rows, nothing is written.
    """
    # NOTE: unpickling executes arbitrary code -- only load trusted files.
    custom = pd.read_pickle(os.path.join(initial_path, file), compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # regex=True is required: since pandas 2.0 the default is a literal match,
    # which would silently leave camel-case words unsplit.
    custom['element'] = custom['element'].str.replace(
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element : ' + file + 'Number of rows to combine: '+ str(total_rows))

    if total_rows == 0:
        logging.warning('No rows to process')
        return

    logging.warning('creating combinations')
    # Collect plain Python lists and build the frame once at the end; growing
    # a DataFrame cell by cell and concatenating per row is quadratic.
    categories = []
    elements = []
    labels = []
    for category, words in zip(custom['category'], custom['element']):
        logging.warning(words)
        logging.warning('Number of words to combine: '+ str(len(words.split())))
        variants = _spacing_variants(words)
        elements.extend(variants)
        categories.extend([category] * len(variants))
        # Row labels restart at 0 for every source row, matching the labels
        # the per-row frames carried before they were concatenated.
        labels.extend(range(len(variants)))

    combined_df = pd.DataFrame(
        {'category': categories, 'element': elements},
        index=labels,
        columns=['category', 'element'],
    )

    os.makedirs(final_path, exist_ok=True)
    combined_df.to_pickle(os.path.join(final_path, file), compression='gzip')
    combined_df.to_csv(os.path.join(final_path, os.path.splitext(file)[0] + '.csv'))
    logging.warning('completed ' + file)
if __name__ == "__main__":
    # Script entry point: process every not-yet-processed '.pickle' file in
    # ./to_process/ in parallel and report the elapsed wall-clock time.
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    cores = multiprocessing.cpu_count()  # upper bound for worker processes
    path ='./to_process/'
    combi_path = './processed/'
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # Keep only pickles that do not already have an output of the same name.
    pickle_files = []
    for any_file in files:
        if not any_file.endswith('.pickle'):
            continue
        if os.path.isfile(combi_path+any_file):
            logging.warning(any_file +' already processed.')
        else:
            pickle_files.append(any_file)

    start = time.time()
    if pickle_files:
        # One worker per file would start thousands of processes for a large
        # backlog, and a pool of size 0 raises ValueError; cap the pool at the
        # core count and skip it entirely when there is nothing to do.
        p = multiprocessing.Pool(processes=min(cores, len(pickle_files)))
        async_result = p.map_async(create_combinations, pickle_files)
        p.close()
        p.join()
        # All tasks have finished after join(); get() re-raises the first
        # worker exception, which would otherwise be lost silently.
        try:
            async_result.get()
        except Exception:
            logging.exception('At least one file could not be processed.')
    else:
        logging.warning('No files to process.')
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
Code eingeben hier