Yet another attempt, this time with MCE-like chunking for a more apples-to-apples comparison. Workers seek to their offset position and slurp the whole chunk.

Last updated on March 28, 2025.

Python parallel demonstration

#!/usr/bin/env python
# time NUM_THREADS=3 python pcount2.py big

import os, re, sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Pool, cpu_count, Lock

if len(sys.argv) < 2:
    sys.exit(1)

mul_re = re.compile(r"mul\(\d{1,3},\d{1,3}\)")
lock = Lock()

def process_chunk(params):
    """
    Worker function to process chunks in parallel.
    Read IO is serial to not cause a denial of service for SAN storage.
    Comment out locking for parallel IO.
    """
    file_name, chunk_start, chunk_end = params
    with open(file_name, "r") as f:
        f.seek(chunk_start)
        # lock.acquire()
        chunk = f.read(chunk_end - chunk_start)
        # lock.release()
    count = len(mul_re.findall(chunk))
    return count

def parallel_read(file_name):
    num_processes = int(os.getenv('NUM_THREADS') or cpu_count())

    def gen_offsets():
        # Emit next offset input [ file_name, chunk_start, chunk_end ]
        try:
            file_size = os.path.getsize(file_name)
        except Exception as e:
            print(e, file=sys.stderr); sys.exit(1)
        chunk_size = 65536 * 16
        position = 0
        with open(file_name, 'r') as f:
            while True:
                chunk_start = position
                if chunk_start > file_size - 1:
                    break
                if chunk_start + chunk_size <= file_size:
                    f.seek(chunk_start + chunk_size - 1)
                    if f.read(1) == '\n':
                        # Chunk ends with linefeed
                        chunk_end = chunk_start + chunk_size
                        position += chunk_size
                    else:
                        # Include the rest of line
                        length = len(f.readline())
                        chunk_end = chunk_start + chunk_size + length
                        position += chunk_size + length
                else:
                    position = chunk_end = file_size
                yield [ file_name, chunk_start, chunk_end ]

    # Run chunks in parallel and tally count
    count = 0

    # # Map possibly slower due to overhead preserving order
    # with Pool(num_processes) as p:
    #     results = p.map(process_chunk, gen_offsets())
    #     count = sum(results)

    # Try imap_unordered when ordered results unnecessary
    with Pool(num_processes) as p:
        results = list(p.imap_unordered(process_chunk, gen_offsets()))
        count = sum(results)

    # # Try also, concurrent.futures
    # with ProcessPoolExecutor(max_workers=num_processes) as executor:
    #     futures = [executor.submit(process_chunk, params) \
    #                for params in gen_offsets()]
    #     for future in as_completed(futures):
    #         count += future.result()

    return count

count = parallel_read(sys.argv[1])
print(f"Found {count} matches.")
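To try the script without the original input, here is a minimal sketch of a generator for a compatible test file. The filler text, line count, and pattern density are my assumptions; only the mul(a,b) token shape comes from the regex in the script above.

```python
import random

def gen_test_file(path, lines=1000, seed=42):
    """Write `lines` lines of filler text sprinkled with mul(a,b)
    tokens matching mul\(\d{1,3},\d{1,3}\); return the token count."""
    rng = random.Random(seed)
    written = 0
    with open(path, "w") as f:
        for _ in range(lines):
            parts = []
            for _ in range(rng.randint(1, 4)):
                # Filler contains no digits or parens, so it cannot
                # create accidental matches.
                parts.append("xyzzy" * rng.randint(1, 5))
                parts.append(f"mul({rng.randint(0, 999)},{rng.randint(0, 999)})")
                written += 1
            f.write("".join(parts) + "\n")
    return written

# e.g. gen_test_file("big", lines=100_000) for a sizeable input
```

Since every chunk boundary in the script is snapped to a linefeed, the generator's one-pattern-never-spans-two-lines property keeps the parallel count exact.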

Results

Found 1999533 matches.

NUM_THREADS: wall time
1: 2.628s
2: 1.342s
3: 0.914s
4: 0.703s
5: 0.571s
6: 0.482s
7: 0.423s
8: 0.375s
9: 0.339s
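For the curious, the speedup and parallel efficiency implied by the posted timings can be computed directly (the numbers below are the ones from the run above; nothing else is assumed):

```python
# Speedup and efficiency relative to the 1-worker run
times = {1: 2.628, 2: 1.342, 3: 0.914, 4: 0.703, 5: 0.571,
         6: 0.482, 7: 0.423, 8: 0.375, 9: 0.339}

t1 = times[1]
for n, t in sorted(times.items()):
    speedup = t1 / t           # how many times faster than 1 worker
    efficiency = speedup / n   # fraction of ideal linear scaling
    print(f"{n} workers: speedup {speedup:.2f}x, efficiency {efficiency:.0%}")
```

Efficiency staying near 87% at 9 workers suggests the chunked reads are not yet IO-bound on this box.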

In reply to Re^4: Python regex faster than Perl? - Chunking 1 GB parallel by marioroy
in thread Python regex faster than Perl? by dave93
