tf_sharepoint_integration/test_document_parser.py
Daniel Grozdanovic bcd0f8a227
Some checks failed
CI - SharePoint Plugin with SonarQube / Test and SonarQube Analysis (push) Has been cancelled
Initial commit: SharePoint connector and ToothFairyAI integration
2026-02-22 17:58:45 +02:00

146 lines
5.3 KiB
Python

"""
Unit tests for document_parser.py
"""
import pytest
from document_parser import DocumentParser, get_file_info
class TestDocumentParser:
    """Checks for DocumentParser: extension handling and text-based parsing."""

    def setup_method(self):
        """Build a fresh parser before each test method runs."""
        self.parser = DocumentParser()

    def test_can_parse_supported_extensions(self):
        """Every file name on the supported list must be accepted by can_parse."""
        accepted = (
            # plain text and structured data
            'document.txt', 'readme.md', 'data.csv', 'config.json',
            # office-style documents
            'report.pdf', 'document.docx', 'spreadsheet.xlsx', 'slides.pptx',
            # source code and markup
            'script.py', 'code.java', 'style.css', 'index.html',
        )
        for name in accepted:
            assert self.parser.can_parse(name), f"Should parse {name}"

    def test_can_parse_unsupported_extensions(self):
        """Media, archive, executable and legacy .doc names must be rejected."""
        rejected = (
            'image.png', 'video.mp4', 'audio.mp3', 'archive.zip',
            'binary.exe', 'document.doc',
        )
        for name in rejected:
            assert not self.parser.can_parse(name), f"Should not parse {name}"

    def test_get_extension(self):
        """_get_extension is expected to return the lower-cased final suffix."""
        expectations = {
            'file.txt': '.txt',
            'FILE.TXT': '.txt',        # upper-case input, lower-case result
            'archive.tar.gz': '.gz',   # only the last suffix counts
            'noextension': '',         # no dot -> empty string
        }
        for name, suffix in expectations.items():
            assert self.parser._get_extension(name) == suffix

    def test_parse_text_utf8(self):
        """A UTF-8 encoded .txt payload round-trips through parse unchanged."""
        text = "Hello World\nThis is a test"
        parsed = self.parser.parse(text.encode('utf-8'), 'test.txt')
        assert parsed == text

    def test_parse_text_multiple_encodings(self):
        """_parse_text returns the same text for UTF-8 and Latin-1 input bytes."""
        text = "Test content"
        # NOTE(review): the text is pure ASCII, so both encodings produce the
        # same bytes; this does not exercise any non-UTF-8 fallback path.
        for codec in ('utf-8', 'latin-1'):
            decoded = self.parser._parse_text(text.encode(codec))
            assert decoded == "Test content"

    def test_parse_unsupported_file_raises_error(self):
        """parse must raise ValueError when handed an unsupported file type."""
        with pytest.raises(ValueError, match="Unsupported file type"):
            self.parser.parse(b"content", "file.exe")

    def test_parse_json(self):
        """Parsed JSON output still contains both key/value pairs."""
        payload = b'{"key": "value", "number": 123}'
        rendered = self.parser.parse(payload, 'data.json')
        for fragment in ('"key": "value"', '"number": 123'):
            assert fragment in rendered

    def test_parse_csv(self):
        """Parsed CSV output contains the cell values of the first data row."""
        rows = ["name,age,city", "Alice,30,NYC", "Bob,25,LA"]
        payload = "\n".join(rows).encode('utf-8')
        rendered = self.parser.parse(payload, 'data.csv')
        for cell in ("Alice", "30", "NYC"):
            assert cell in rendered
class TestGetFileInfo:
    """Checks for get_file_info: category, extension and size formatting."""

    # Binary size units used to build the byte counts passed to get_file_info.
    _KIB = 1024
    _MIB = 1024 ** 2
    _GIB = 1024 ** 3

    def test_document_category(self):
        """A .pdf file is categorised as a document."""
        details = get_file_info('report.pdf', self._KIB)
        assert details['category'] == 'document'
        assert details['extension'] == '.pdf'

    def test_spreadsheet_category(self):
        """A .xlsx file is categorised as a spreadsheet."""
        details = get_file_info('data.xlsx', 2 * self._KIB)
        assert details['category'] == 'spreadsheet'
        assert details['extension'] == '.xlsx'

    def test_presentation_category(self):
        """A .pptx file is categorised as a presentation."""
        details = get_file_info('slides.pptx', 4 * self._KIB)
        assert details['category'] == 'presentation'

    def test_code_category(self):
        """A .py file is categorised as code."""
        details = get_file_info('script.py', 512)
        assert details['category'] == 'code'

    def test_image_category(self):
        """A .jpg file is categorised as an image."""
        details = get_file_info('photo.jpg', 8 * self._KIB)
        assert details['category'] == 'image'

    def test_file_size_bytes(self):
        """Sizes below one kilobyte are rendered as a plain byte count."""
        size = 512
        details = get_file_info('small.txt', size)
        assert details['size_formatted'] == '512 B'
        assert details['size_bytes'] == size

    def test_file_size_kilobytes(self):
        """A 2048-byte file is rendered in KB and keeps its raw byte count."""
        size = 2 * self._KIB
        details = get_file_info('medium.txt', size)
        assert 'KB' in details['size_formatted']
        assert details['size_bytes'] == size

    def test_file_size_megabytes(self):
        """Five mebibytes are rendered as a '5.0' value with an MB unit."""
        rendered = get_file_info('large.pdf', 5 * self._MIB)['size_formatted']
        assert 'MB' in rendered
        assert '5.0' in rendered

    def test_file_size_gigabytes(self):
        """Two gibibytes are rendered as a '2.0' value with a GB unit."""
        rendered = get_file_info('huge.zip', 2 * self._GIB)['size_formatted']
        assert 'GB' in rendered
        assert '2.0' in rendered

    def test_unknown_extension(self):
        """An unrecognised extension falls back to the generic 'file' category."""
        details = get_file_info('file.xyz', self._KIB)
        assert details['category'] == 'file'
        assert details['extension'] == '.xyz'

    def test_no_extension(self):
        """A name without a dot reports an empty extension."""
        details = get_file_info('README', self._KIB)
        assert details['extension'] == ''